/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Eric Snow2ebc5ce2017-09-07 23:51:28 -060043#include "internal/pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050045#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070046#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/

/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/


#ifdef __cplusplus
extern "C" {
#endif

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Object check used by the accessor macros in this file: debug builds run
   _PyUnicode_CheckConsistency() (without the content scan), other builds
   only do the PyUnicode_Check() type test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw field accessors.  The underscore-prefixed variants without an assert
   expand to plain lvalues; the others check the object first. */

/* utf8 pointer field of a PyCompactUnicodeObject (lvalue). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for a compact ASCII object this is the
   memory located right after the PyASCIIObject header, otherwise the utf8
   field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))
/* utf8_length field of a PyCompactUnicodeObject (lvalue). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: the character length for a compact ASCII
   object, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
/* Checked read of state.kind. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
/* Checked read of the length field. */
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)
/* data.any pointer of a PyUnicodeObject (lvalue). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for the public PyUnicode_READY() macro: evaluates
   to 0 when the string is already ready, otherwise to the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? \
      0 : \
      _PyUnicode_Ready(op)))

/* True if the utf8 pointer of a (non compact-ASCII) string aliases its
   character data instead of pointing to a separate buffer. */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer aliases the character data.  The macro must only
   refer to its own parameter: it previously expanded
   _PyUnicode_WSTR(unicode), silently depending on a variable named
   "unicode" being in scope at the call site. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    ((!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    ((_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The copy is done four elements at a time, followed by a scalar loop for
   the remaining 0-3 elements.  All locals are underscore-prefixed so they
   cannot collide with identifiers used in the macro arguments. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do { \
        to_type *_to = (to_type *)(to); \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter); \
        const from_type *_unrolled_end = \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4); \
        while (_iter < (_unrolled_end)) { \
            _to[0] = (to_type) _iter[0]; \
            _to[1] = (to_type) _iter[1]; \
            _to[2] = (to_type) _iter[2]; \
            _to[3] = (to_type) _iter[3]; \
            _iter += 4; _to += 4; \
        } \
        while (_iter < (_end)) \
            *_to++ = (to_type) *_iter++; \
    } while (0)

#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% works best */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% works best */
#  define OVERALLOCATE_FACTOR 4
#endif

Walter Dörwald16807132007-05-25 13:52:07 +0000189/* This dictionary holds all interned unicode strings. Note that references
190 to strings in this dictionary are *not* counted in the string's ob_refcnt.
191 When the interned string reaches a refcnt of 0 the string deallocation
192 function will delete the reference from this dictionary.
193
194 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000195 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000196*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200201
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203 do { \
204 if (unicode_empty != NULL) \
205 Py_INCREF(unicode_empty); \
206 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207 unicode_empty = PyUnicode_New(0, 0); \
208 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200209 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
211 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000214
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215#define _Py_RETURN_UNICODE_EMPTY() \
216 do { \
217 _Py_INCREF_UNICODE_EMPTY(); \
218 return unicode_empty; \
219 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000220
/* Store `value` into `length` consecutive characters of the buffer `data`
   (character width selected by `kind`), starting at index `start`.

   Every argument is parenthesized and the locals carry a _fill_ prefix so
   that compound expressions and caller variables named ch/to/end can be
   passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        assert(0 <= (start)); \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch (kind) { \
        case PyUnicode_1BYTE_KIND: { \
            assert((value) <= 0xff); \
            Py_UCS1 _fill_ch = (unsigned char)(value); \
            Py_UCS1 *_fill_to = (Py_UCS1 *)(data) + (start); \
            memset(_fill_to, _fill_ch, (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            assert((value) <= 0xffff); \
            Py_UCS2 _fill_ch = (Py_UCS2)(value); \
            Py_UCS2 *_fill_to = (Py_UCS2 *)(data) + (start); \
            const Py_UCS2 *_fill_end = _fill_to + (length); \
            for (; _fill_to < _fill_end; ++_fill_to) *_fill_to = _fill_ch; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            assert((value) <= MAX_UNICODE); \
            Py_UCS4 _fill_ch = (value); \
            Py_UCS4 *_fill_to = (Py_UCS4 *)(data) + (start); \
            const Py_UCS4 *_fill_end = _fill_to + (length); \
            for (; _fill_to < _fill_end; ++_fill_to) *_fill_to = _fill_ch; \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)


Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200254/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700255static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200256_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
257
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200258/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200259static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200260
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261/* Single character Unicode strings in the Latin-1 range are being
262 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200263static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000264
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry is 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x0009: CHARACTER TABULATION */
    /* 0x000A: LINE FEED */
    /* 0x000B: LINE TABULATION */
    /* 0x000C: FORM FEED */
    /* 0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x001C: FILE SEPARATOR */
    /* 0x001D: GROUP SEPARATOR */
    /* 0x001E: RECORD SEPARATOR */
    /* 0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200296/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200297static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200298static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100299static int unicode_modifiable(PyObject *unicode);
300
Victor Stinnerfe226c02011-10-03 03:52:20 +0200301
Alexander Belopolsky40018472011-02-26 01:02:56 +0000302static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100303_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200304static PyObject *
305_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
306static PyObject *
307_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
308
309static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000310unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000311 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100312 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000313 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
314
Alexander Belopolsky40018472011-02-26 01:02:56 +0000315static void
316raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300317 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100318 PyObject *unicode,
319 Py_ssize_t startpos, Py_ssize_t endpos,
320 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000321
/* Same for linebreaks: indexed by code point 0..127, an entry is 1 for a
   line-break character. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x000A: LINE FEED */
    /* 0x000B: LINE TABULATION */
    /* 0x000C: FORM FEED */
    /* 0x000D: CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x001C: FILE SEPARATOR */
    /* 0x001D: GROUP SEPARATOR */
    /* 0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

INADA Naoki3ae20562017-01-16 20:41:20 +0900350static int convert_uc(PyObject *obj, void *addr);
351
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300352#include "clinic/unicodeobject.c.h"
353
/* Codec error handlers recognized by name.  _Py_ERROR_OTHER is returned by
   get_error_handler() for any name it does not know. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: NUL-terminated handler name, or NULL.
   Returns _Py_ERROR_STRICT for NULL or "strict", the matching constant for
   the other known names (exact, case-sensitive comparison), and
   _Py_ERROR_OTHER for anything else. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict", _Py_ERROR_STRICT},
        {"surrogateescape", _Py_ERROR_SURROGATEESCAPE},
        {"replace", _Py_ERROR_REPLACE},
        {"ignore", _Py_ERROR_IGNORE},
        {"backslashreplace", _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass", _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t i;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (i = 0; i < sizeof(known_handlers) / sizeof(known_handlers[0]); i++) {
        if (strcmp(errors, known_handlers[i].name) == 0) {
            return known_handlers[i].handler;
        }
    }
    return _Py_ERROR_OTHER;
}

Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300393/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
394 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000395Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000396PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000397{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000398#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000399 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000400#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000401 /* This is actually an illegal character, so it should
402 not be passed to unichr. */
403 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000404#endif
405}
406
#ifdef Py_DEBUG
/* Debug-build invariant checker for a str object.

   op:            object to check; must pass PyUnicode_Check().
   check_content: if non-zero (and the string is not in the legacy wchar
                  representation), also scan the characters to verify that
                  the narrowest possible kind is used and that the buffer
                  is null-terminated.

   Violations trip an assert(); otherwise the function returns 1, so it can
   itself be used as an assert() argument. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: the characters follow the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not-ready string: only wstr is populated */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr may alias the data only when wchar_t has the same width
               as the characters of this kind */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the buffer must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif

Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100523static PyObject*
524unicode_result_wchar(PyObject *unicode)
525{
526#ifndef Py_DEBUG
527 Py_ssize_t len;
528
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100529 len = _PyUnicode_WSTR_LENGTH(unicode);
530 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100531 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200532 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100533 }
534
535 if (len == 1) {
536 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100537 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100538 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
539 Py_DECREF(unicode);
540 return latin1_char;
541 }
542 }
543
544 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200545 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100546 return NULL;
547 }
548#else
Victor Stinneraa771272012-10-04 02:32:58 +0200549 assert(Py_REFCNT(unicode) == 1);
550
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100551 /* don't make the result ready in debug mode to ensure that the caller
552 makes the string ready before using it */
553 assert(_PyUnicode_CheckConsistency(unicode, 1));
554#endif
555 return unicode;
556}
557
558static PyObject*
559unicode_result_ready(PyObject *unicode)
560{
561 Py_ssize_t length;
562
563 length = PyUnicode_GET_LENGTH(unicode);
564 if (length == 0) {
565 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100566 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200567 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100568 }
569 return unicode_empty;
570 }
571
572 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200573 void *data = PyUnicode_DATA(unicode);
574 int kind = PyUnicode_KIND(unicode);
575 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100576 if (ch < 256) {
577 PyObject *latin1_char = unicode_latin1[ch];
578 if (latin1_char != NULL) {
579 if (unicode != latin1_char) {
580 Py_INCREF(latin1_char);
581 Py_DECREF(unicode);
582 }
583 return latin1_char;
584 }
585 else {
586 assert(_PyUnicode_CheckConsistency(unicode, 1));
587 Py_INCREF(unicode);
588 unicode_latin1[ch] = unicode;
589 return unicode;
590 }
591 }
592 }
593
594 assert(_PyUnicode_CheckConsistency(unicode, 1));
595 return unicode;
596}
597
598static PyObject*
599unicode_result(PyObject *unicode)
600{
601 assert(_PyUnicode_CHECK(unicode));
602 if (PyUnicode_IS_READY(unicode))
603 return unicode_result_ready(unicode);
604 else
605 return unicode_result_wchar(unicode);
606}
607
Victor Stinnerc4b49542011-12-11 22:44:26 +0100608static PyObject*
609unicode_result_unchanged(PyObject *unicode)
610{
611 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500612 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100613 return NULL;
614 Py_INCREF(unicode);
615 return unicode;
616 }
617 else
618 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100619 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100620}
621
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200622/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
623 ASCII, Latin1, UTF-8, etc. */
624static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200625backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200626 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
627{
Victor Stinnerad771582015-10-09 12:38:53 +0200628 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200629 Py_UCS4 ch;
630 enum PyUnicode_Kind kind;
631 void *data;
632
633 assert(PyUnicode_IS_READY(unicode));
634 kind = PyUnicode_KIND(unicode);
635 data = PyUnicode_DATA(unicode);
636
637 size = 0;
638 /* determine replacement size */
639 for (i = collstart; i < collend; ++i) {
640 Py_ssize_t incr;
641
642 ch = PyUnicode_READ(kind, data, i);
643 if (ch < 0x100)
644 incr = 2+2;
645 else if (ch < 0x10000)
646 incr = 2+4;
647 else {
648 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200649 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200650 }
651 if (size > PY_SSIZE_T_MAX - incr) {
652 PyErr_SetString(PyExc_OverflowError,
653 "encoded result is too long for a Python string");
654 return NULL;
655 }
656 size += incr;
657 }
658
Victor Stinnerad771582015-10-09 12:38:53 +0200659 str = _PyBytesWriter_Prepare(writer, str, size);
660 if (str == NULL)
661 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662
663 /* generate replacement */
664 for (i = collstart; i < collend; ++i) {
665 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200666 *str++ = '\\';
667 if (ch >= 0x00010000) {
668 *str++ = 'U';
669 *str++ = Py_hexdigits[(ch>>28)&0xf];
670 *str++ = Py_hexdigits[(ch>>24)&0xf];
671 *str++ = Py_hexdigits[(ch>>20)&0xf];
672 *str++ = Py_hexdigits[(ch>>16)&0xf];
673 *str++ = Py_hexdigits[(ch>>12)&0xf];
674 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200675 }
Victor Stinner797485e2015-10-09 03:17:30 +0200676 else if (ch >= 0x100) {
677 *str++ = 'u';
678 *str++ = Py_hexdigits[(ch>>12)&0xf];
679 *str++ = Py_hexdigits[(ch>>8)&0xf];
680 }
681 else
682 *str++ = 'x';
683 *str++ = Py_hexdigits[(ch>>4)&0xf];
684 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200685 }
686 return str;
687}
688
689/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
690 ASCII, Latin1, UTF-8, etc. */
691static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200692xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200693 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
694{
Victor Stinnerad771582015-10-09 12:38:53 +0200695 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200696 Py_UCS4 ch;
697 enum PyUnicode_Kind kind;
698 void *data;
699
700 assert(PyUnicode_IS_READY(unicode));
701 kind = PyUnicode_KIND(unicode);
702 data = PyUnicode_DATA(unicode);
703
704 size = 0;
705 /* determine replacement size */
706 for (i = collstart; i < collend; ++i) {
707 Py_ssize_t incr;
708
709 ch = PyUnicode_READ(kind, data, i);
710 if (ch < 10)
711 incr = 2+1+1;
712 else if (ch < 100)
713 incr = 2+2+1;
714 else if (ch < 1000)
715 incr = 2+3+1;
716 else if (ch < 10000)
717 incr = 2+4+1;
718 else if (ch < 100000)
719 incr = 2+5+1;
720 else if (ch < 1000000)
721 incr = 2+6+1;
722 else {
723 assert(ch <= MAX_UNICODE);
724 incr = 2+7+1;
725 }
726 if (size > PY_SSIZE_T_MAX - incr) {
727 PyErr_SetString(PyExc_OverflowError,
728 "encoded result is too long for a Python string");
729 return NULL;
730 }
731 size += incr;
732 }
733
Victor Stinnerad771582015-10-09 12:38:53 +0200734 str = _PyBytesWriter_Prepare(writer, str, size);
735 if (str == NULL)
736 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200737
738 /* generate replacement */
739 for (i = collstart; i < collend; ++i) {
740 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
741 }
742 return str;
743}
744
Thomas Wouters477c8d52006-05-27 19:21:47 +0000745/* --- Bloom Filters ----------------------------------------------------- */
746
747/* stuff to implement simple "bloom filters" for Unicode characters.
748 to keep things simple, we use a single bitmask, using the least 5
749 bits from each unicode characters as the bit index. */
750
751/* the linebreak mask is set up by Unicode_Init below */
752
Antoine Pitrouf068f942010-01-13 14:19:12 +0000753#if LONG_BIT >= 128
754#define BLOOM_WIDTH 128
755#elif LONG_BIT >= 64
756#define BLOOM_WIDTH 64
757#elif LONG_BIT >= 32
758#define BLOOM_WIDTH 32
759#else
760#error "LONG_BIT is smaller than 32"
761#endif
762
Thomas Wouters477c8d52006-05-27 19:21:47 +0000763#define BLOOM_MASK unsigned long
764
Serhiy Storchaka05997252013-01-26 12:14:02 +0200765static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000766
Antoine Pitrouf068f942010-01-13 14:19:12 +0000767#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000768
Benjamin Peterson29060642009-01-31 22:14:21 +0000769#define BLOOM_LINEBREAK(ch) \
770 ((ch) < 128U ? ascii_linebreak[(ch)] : \
771 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000772
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700773static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200774make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000775{
Victor Stinnera85af502013-04-09 21:53:54 +0200776#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
777 do { \
778 TYPE *data = (TYPE *)PTR; \
779 TYPE *end = data + LEN; \
780 Py_UCS4 ch; \
781 for (; data != end; data++) { \
782 ch = *data; \
783 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
784 } \
785 break; \
786 } while (0)
787
Thomas Wouters477c8d52006-05-27 19:21:47 +0000788 /* calculate simple bloom-style bitmask for a given unicode string */
789
Antoine Pitrouf068f942010-01-13 14:19:12 +0000790 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000791
792 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200793 switch (kind) {
794 case PyUnicode_1BYTE_KIND:
795 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
796 break;
797 case PyUnicode_2BYTE_KIND:
798 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
799 break;
800 case PyUnicode_4BYTE_KIND:
801 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
802 break;
803 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700804 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200805 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000806 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200807
808#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000809}
810
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300811static int
812ensure_unicode(PyObject *obj)
813{
814 if (!PyUnicode_Check(obj)) {
815 PyErr_Format(PyExc_TypeError,
816 "must be str, not %.100s",
817 Py_TYPE(obj)->tp_name);
818 return -1;
819 }
820 return PyUnicode_READY(obj);
821}
822
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200823/* Compilation of templated routines */
824
825#include "stringlib/asciilib.h"
826#include "stringlib/fastsearch.h"
827#include "stringlib/partition.h"
828#include "stringlib/split.h"
829#include "stringlib/count.h"
830#include "stringlib/find.h"
831#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200832#include "stringlib/undef.h"
833
834#include "stringlib/ucs1lib.h"
835#include "stringlib/fastsearch.h"
836#include "stringlib/partition.h"
837#include "stringlib/split.h"
838#include "stringlib/count.h"
839#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300840#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200841#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200842#include "stringlib/undef.h"
843
844#include "stringlib/ucs2lib.h"
845#include "stringlib/fastsearch.h"
846#include "stringlib/partition.h"
847#include "stringlib/split.h"
848#include "stringlib/count.h"
849#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300850#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200851#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200852#include "stringlib/undef.h"
853
854#include "stringlib/ucs4lib.h"
855#include "stringlib/fastsearch.h"
856#include "stringlib/partition.h"
857#include "stringlib/split.h"
858#include "stringlib/count.h"
859#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300860#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200861#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200862#include "stringlib/undef.h"
863
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200864#include "stringlib/unicodedefs.h"
865#include "stringlib/fastsearch.h"
866#include "stringlib/count.h"
867#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100868#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200869
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870/* --- Unicode Object ----------------------------------------------------- */
871
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700872static inline Py_ssize_t
873findchar(const void *s, int kind,
874 Py_ssize_t size, Py_UCS4 ch,
875 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200876{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200877 switch (kind) {
878 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200879 if ((Py_UCS1) ch != ch)
880 return -1;
881 if (direction > 0)
882 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
883 else
884 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200885 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200886 if ((Py_UCS2) ch != ch)
887 return -1;
888 if (direction > 0)
889 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
890 else
891 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200892 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200893 if (direction > 0)
894 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
895 else
896 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200897 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700898 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900}
901
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in [old_length, current length) are overwritten, each
   byte being set to 0xff; nothing happens if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind value is used as the size of one character in bytes */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
920
Victor Stinnerfe226c02011-10-03 03:52:20 +0200921static PyObject*
922resize_compact(PyObject *unicode, Py_ssize_t length)
923{
924 Py_ssize_t char_size;
925 Py_ssize_t struct_size;
926 Py_ssize_t new_size;
927 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100928 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200929#ifdef Py_DEBUG
930 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
931#endif
932
Victor Stinner79891572012-05-03 13:43:07 +0200933 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200934 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100935 assert(PyUnicode_IS_COMPACT(unicode));
936
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200937 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100938 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 struct_size = sizeof(PyASCIIObject);
940 else
941 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200942 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943
Victor Stinnerfe226c02011-10-03 03:52:20 +0200944 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
945 PyErr_NoMemory();
946 return NULL;
947 }
948 new_size = (struct_size + (length + 1) * char_size);
949
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200950 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
951 PyObject_DEL(_PyUnicode_UTF8(unicode));
952 _PyUnicode_UTF8(unicode) = NULL;
953 _PyUnicode_UTF8_LENGTH(unicode) = 0;
954 }
Victor Stinner84def372011-12-11 20:04:56 +0100955 _Py_DEC_REFTOTAL;
956 _Py_ForgetReference(unicode);
957
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300958 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100959 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100960 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 PyErr_NoMemory();
962 return NULL;
963 }
Victor Stinner84def372011-12-11 20:04:56 +0100964 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200965 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100966
Victor Stinnerfe226c02011-10-03 03:52:20 +0200967 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200968 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200969 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100970 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200971 _PyUnicode_WSTR_LENGTH(unicode) = length;
972 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100973 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
974 PyObject_DEL(_PyUnicode_WSTR(unicode));
975 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100976 if (!PyUnicode_IS_ASCII(unicode))
977 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100978 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200979#ifdef Py_DEBUG
980 unicode_fill_invalid(unicode, old_length);
981#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200982 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
983 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200984 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200985 return unicode;
986}
987
Alexander Belopolsky40018472011-02-26 01:02:56 +0000988static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200989resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000990{
Victor Stinner95663112011-10-04 01:03:50 +0200991 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100992 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200993 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000995
Victor Stinnerfe226c02011-10-03 03:52:20 +0200996 if (PyUnicode_IS_READY(unicode)) {
997 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200998 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001000#ifdef Py_DEBUG
1001 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1002#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003
1004 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001005 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001006 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1007 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001008
1009 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1010 PyErr_NoMemory();
1011 return -1;
1012 }
1013 new_size = (length + 1) * char_size;
1014
Victor Stinner7a9105a2011-12-12 00:13:42 +01001015 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1016 {
1017 PyObject_DEL(_PyUnicode_UTF8(unicode));
1018 _PyUnicode_UTF8(unicode) = NULL;
1019 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1020 }
1021
Victor Stinnerfe226c02011-10-03 03:52:20 +02001022 data = (PyObject *)PyObject_REALLOC(data, new_size);
1023 if (data == NULL) {
1024 PyErr_NoMemory();
1025 return -1;
1026 }
1027 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001028 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001030 _PyUnicode_WSTR_LENGTH(unicode) = length;
1031 }
1032 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001033 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001034 _PyUnicode_UTF8_LENGTH(unicode) = length;
1035 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001036 _PyUnicode_LENGTH(unicode) = length;
1037 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 unicode_fill_invalid(unicode, old_length);
1040#endif
Victor Stinner95663112011-10-04 01:03:50 +02001041 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001042 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001044 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001045 }
Victor Stinner95663112011-10-04 01:03:50 +02001046 assert(_PyUnicode_WSTR(unicode) != NULL);
1047
1048 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001049 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001050 PyErr_NoMemory();
1051 return -1;
1052 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001053 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001054 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001055 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001056 if (!wstr) {
1057 PyErr_NoMemory();
1058 return -1;
1059 }
1060 _PyUnicode_WSTR(unicode) = wstr;
1061 _PyUnicode_WSTR(unicode)[length] = 0;
1062 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001063 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 return 0;
1065}
1066
Victor Stinnerfe226c02011-10-03 03:52:20 +02001067static PyObject*
1068resize_copy(PyObject *unicode, Py_ssize_t length)
1069{
1070 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001071 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001072 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001073
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001074 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001075
1076 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1077 if (copy == NULL)
1078 return NULL;
1079
1080 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001081 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001083 }
1084 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001085 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001086
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001087 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001088 if (w == NULL)
1089 return NULL;
1090 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1091 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001092 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001093 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001094 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 }
1096}
1097
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001099 Ux0000 terminated; some code (e.g. new_identifier)
1100 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101
1102 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001103 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001104
1105*/
1106
Alexander Belopolsky40018472011-02-26 01:02:56 +00001107static PyUnicodeObject *
1108_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001110 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112
Thomas Wouters477c8d52006-05-27 19:21:47 +00001113 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 if (length == 0 && unicode_empty != NULL) {
1115 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001116 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 }
1118
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001119 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001120 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001121 return (PyUnicodeObject *)PyErr_NoMemory();
1122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 if (length < 0) {
1124 PyErr_SetString(PyExc_SystemError,
1125 "Negative size passed to _PyUnicode_New");
1126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127 }
1128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1130 if (unicode == NULL)
1131 return NULL;
1132 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001133
1134 _PyUnicode_WSTR_LENGTH(unicode) = length;
1135 _PyUnicode_HASH(unicode) = -1;
1136 _PyUnicode_STATE(unicode).interned = 0;
1137 _PyUnicode_STATE(unicode).kind = 0;
1138 _PyUnicode_STATE(unicode).compact = 0;
1139 _PyUnicode_STATE(unicode).ready = 0;
1140 _PyUnicode_STATE(unicode).ascii = 0;
1141 _PyUnicode_DATA_ANY(unicode) = NULL;
1142 _PyUnicode_LENGTH(unicode) = 0;
1143 _PyUnicode_UTF8(unicode) = NULL;
1144 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1147 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001148 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001149 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001150 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001151 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152
Jeremy Hyltond8082792003-09-16 19:41:39 +00001153 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001154 * the caller fails before initializing str -- unicode_resize()
1155 * reads str[0], and the Keep-Alive optimization can keep memory
1156 * allocated for str alive across a call to unicode_dealloc(unicode).
1157 * We don't want unicode_resize to read uninitialized memory in
1158 * that case.
1159 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160 _PyUnicode_WSTR(unicode)[0] = 0;
1161 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001162
Victor Stinner7931d9a2011-11-04 00:22:48 +01001163 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 return unicode;
1165}
1166
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167static const char*
1168unicode_kind_name(PyObject *unicode)
1169{
Victor Stinner42dfd712011-10-03 14:41:45 +02001170 /* don't check consistency: unicode_kind_name() is called from
1171 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 if (!PyUnicode_IS_COMPACT(unicode))
1173 {
1174 if (!PyUnicode_IS_READY(unicode))
1175 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001176 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001177 {
1178 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001179 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001180 return "legacy ascii";
1181 else
1182 return "legacy latin1";
1183 case PyUnicode_2BYTE_KIND:
1184 return "legacy UCS2";
1185 case PyUnicode_4BYTE_KIND:
1186 return "legacy UCS4";
1187 default:
1188 return "<legacy invalid kind>";
1189 }
1190 }
1191 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001192 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001193 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001194 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001195 return "ascii";
1196 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001197 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001198 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001199 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001200 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001201 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001202 default:
1203 return "<invalid compact kind>";
1204 }
1205}
1206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208/* Functions wrapping macros for use in debugger */
/* Debugger helper: callable wrapper around the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
1212
/* Debugger helper: callable wrapper around the _PyUnicode_COMPACT_DATA()
   macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
1216void *_PyUnicode_data(void *unicode){
1217 printf("obj %p\n", unicode);
1218 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1219 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1220 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1221 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1222 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1223 return PyUnicode_DATA(unicode);
1224}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001225
1226void
1227_PyUnicode_Dump(PyObject *op)
1228{
1229 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001230 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1231 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1232 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001233
Victor Stinnera849a4b2011-10-03 12:12:11 +02001234 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001235 {
1236 if (ascii->state.ascii)
1237 data = (ascii + 1);
1238 else
1239 data = (compact + 1);
1240 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001241 else
1242 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001243 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1244 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001245
Victor Stinnera849a4b2011-10-03 12:12:11 +02001246 if (ascii->wstr == data)
1247 printf("shared ");
1248 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001249
Victor Stinnera3b334d2011-10-03 13:53:37 +02001250 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001251 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001252 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1253 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001254 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1255 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001256 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001257 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001258}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001259#endif
1260
1261PyObject *
1262PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1263{
1264 PyObject *obj;
1265 PyCompactUnicodeObject *unicode;
1266 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001267 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001268 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269 Py_ssize_t char_size;
1270 Py_ssize_t struct_size;
1271
1272 /* Optimization for empty strings */
1273 if (size == 0 && unicode_empty != NULL) {
1274 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001275 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001276 }
1277
Victor Stinner9e9d6892011-10-04 01:02:02 +02001278 is_ascii = 0;
1279 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001280 struct_size = sizeof(PyCompactUnicodeObject);
1281 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001282 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001283 char_size = 1;
1284 is_ascii = 1;
1285 struct_size = sizeof(PyASCIIObject);
1286 }
1287 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001288 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289 char_size = 1;
1290 }
1291 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001292 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001293 char_size = 2;
1294 if (sizeof(wchar_t) == 2)
1295 is_sharing = 1;
1296 }
1297 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001298 if (maxchar > MAX_UNICODE) {
1299 PyErr_SetString(PyExc_SystemError,
1300 "invalid maximum character passed to PyUnicode_New");
1301 return NULL;
1302 }
Victor Stinner8f825062012-04-27 13:55:39 +02001303 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 char_size = 4;
1305 if (sizeof(wchar_t) == 4)
1306 is_sharing = 1;
1307 }
1308
1309 /* Ensure we won't overflow the size. */
1310 if (size < 0) {
1311 PyErr_SetString(PyExc_SystemError,
1312 "Negative size passed to PyUnicode_New");
1313 return NULL;
1314 }
1315 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1316 return PyErr_NoMemory();
1317
1318 /* Duplicated allocation code from _PyObject_New() instead of a call to
1319 * PyObject_New() so we are able to allocate space for the object and
1320 * it's data buffer.
1321 */
1322 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1323 if (obj == NULL)
1324 return PyErr_NoMemory();
1325 obj = PyObject_INIT(obj, &PyUnicode_Type);
1326 if (obj == NULL)
1327 return NULL;
1328
1329 unicode = (PyCompactUnicodeObject *)obj;
1330 if (is_ascii)
1331 data = ((PyASCIIObject*)obj) + 1;
1332 else
1333 data = unicode + 1;
1334 _PyUnicode_LENGTH(unicode) = size;
1335 _PyUnicode_HASH(unicode) = -1;
1336 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001337 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 _PyUnicode_STATE(unicode).compact = 1;
1339 _PyUnicode_STATE(unicode).ready = 1;
1340 _PyUnicode_STATE(unicode).ascii = is_ascii;
1341 if (is_ascii) {
1342 ((char*)data)[size] = 0;
1343 _PyUnicode_WSTR(unicode) = NULL;
1344 }
Victor Stinner8f825062012-04-27 13:55:39 +02001345 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 ((char*)data)[size] = 0;
1347 _PyUnicode_WSTR(unicode) = NULL;
1348 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001350 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352 else {
1353 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001354 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001355 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001357 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358 ((Py_UCS4*)data)[size] = 0;
1359 if (is_sharing) {
1360 _PyUnicode_WSTR_LENGTH(unicode) = size;
1361 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1362 }
1363 else {
1364 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1365 _PyUnicode_WSTR(unicode) = NULL;
1366 }
1367 }
Victor Stinner8f825062012-04-27 13:55:39 +02001368#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001369 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001370#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001371 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 return obj;
1373}
1374
#if SIZEOF_WCHAR_T == 2
/* Expand the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into a single code point; every other unit (lone surrogates
   included) is copied through unchanged.

   `unicode` must already be a 4-byte-kind string whose length equals the
   number of code points produced (asserted below).  The caller is also
   expected to have reserved room for one extra code point so it can store
   the terminating null character afterwards. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        /* Never write past the declared length of the target. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        is_pair = ((src + 1) < end
                   && Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1414
Victor Stinnercd9950f2011-10-02 00:34:53 +02001415static int
Victor Stinner488fa492011-12-12 00:01:39 +01001416unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001417{
Victor Stinner488fa492011-12-12 00:01:39 +01001418 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001419 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001420 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001421 return -1;
1422 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001423 return 0;
1424}
1425
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001426static int
1427_copy_characters(PyObject *to, Py_ssize_t to_start,
1428 PyObject *from, Py_ssize_t from_start,
1429 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001431 unsigned int from_kind, to_kind;
1432 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433
Victor Stinneree4544c2012-05-09 22:24:08 +02001434 assert(0 <= how_many);
1435 assert(0 <= from_start);
1436 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001437 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001438 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001439 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440
Victor Stinnerd3f08822012-05-29 12:57:52 +02001441 assert(PyUnicode_Check(to));
1442 assert(PyUnicode_IS_READY(to));
1443 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1444
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 if (how_many == 0)
1446 return 0;
1447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001449 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001451 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452
Victor Stinnerf1852262012-06-16 16:38:26 +02001453#ifdef Py_DEBUG
1454 if (!check_maxchar
1455 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1456 {
1457 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1458 Py_UCS4 ch;
1459 Py_ssize_t i;
1460 for (i=0; i < how_many; i++) {
1461 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1462 assert(ch <= to_maxchar);
1463 }
1464 }
1465#endif
1466
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001467 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001468 if (check_maxchar
1469 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1470 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001471 /* Writing Latin-1 characters into an ASCII string requires to
1472 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001473 Py_UCS4 max_char;
1474 max_char = ucs1lib_find_max_char(from_data,
1475 (Py_UCS1*)from_data + how_many);
1476 if (max_char >= 128)
1477 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001478 }
Christian Heimesf051e432016-09-13 20:22:02 +02001479 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001480 (char*)from_data + from_kind * from_start,
1481 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001483 else if (from_kind == PyUnicode_1BYTE_KIND
1484 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001485 {
1486 _PyUnicode_CONVERT_BYTES(
1487 Py_UCS1, Py_UCS2,
1488 PyUnicode_1BYTE_DATA(from) + from_start,
1489 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1490 PyUnicode_2BYTE_DATA(to) + to_start
1491 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001492 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001493 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001494 && to_kind == PyUnicode_4BYTE_KIND)
1495 {
1496 _PyUnicode_CONVERT_BYTES(
1497 Py_UCS1, Py_UCS4,
1498 PyUnicode_1BYTE_DATA(from) + from_start,
1499 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1500 PyUnicode_4BYTE_DATA(to) + to_start
1501 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001502 }
1503 else if (from_kind == PyUnicode_2BYTE_KIND
1504 && to_kind == PyUnicode_4BYTE_KIND)
1505 {
1506 _PyUnicode_CONVERT_BYTES(
1507 Py_UCS2, Py_UCS4,
1508 PyUnicode_2BYTE_DATA(from) + from_start,
1509 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1510 PyUnicode_4BYTE_DATA(to) + to_start
1511 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001512 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001513 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001514 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1515
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001516 if (!check_maxchar) {
1517 if (from_kind == PyUnicode_2BYTE_KIND
1518 && to_kind == PyUnicode_1BYTE_KIND)
1519 {
1520 _PyUnicode_CONVERT_BYTES(
1521 Py_UCS2, Py_UCS1,
1522 PyUnicode_2BYTE_DATA(from) + from_start,
1523 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1524 PyUnicode_1BYTE_DATA(to) + to_start
1525 );
1526 }
1527 else if (from_kind == PyUnicode_4BYTE_KIND
1528 && to_kind == PyUnicode_1BYTE_KIND)
1529 {
1530 _PyUnicode_CONVERT_BYTES(
1531 Py_UCS4, Py_UCS1,
1532 PyUnicode_4BYTE_DATA(from) + from_start,
1533 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1534 PyUnicode_1BYTE_DATA(to) + to_start
1535 );
1536 }
1537 else if (from_kind == PyUnicode_4BYTE_KIND
1538 && to_kind == PyUnicode_2BYTE_KIND)
1539 {
1540 _PyUnicode_CONVERT_BYTES(
1541 Py_UCS4, Py_UCS2,
1542 PyUnicode_4BYTE_DATA(from) + from_start,
1543 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1544 PyUnicode_2BYTE_DATA(to) + to_start
1545 );
1546 }
1547 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001548 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001549 }
1550 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001551 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001552 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001553 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001554 Py_ssize_t i;
1555
Victor Stinnera0702ab2011-09-29 14:14:38 +02001556 for (i=0; i < how_many; i++) {
1557 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001558 if (ch > to_maxchar)
1559 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001560 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1561 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001562 }
1563 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 return 0;
1565}
1566
Victor Stinnerd3f08822012-05-29 12:57:52 +02001567void
1568_PyUnicode_FastCopyCharacters(
1569 PyObject *to, Py_ssize_t to_start,
1570 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001571{
1572 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1573}
1574
1575Py_ssize_t
1576PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1577 PyObject *from, Py_ssize_t from_start,
1578 Py_ssize_t how_many)
1579{
1580 int err;
1581
1582 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1583 PyErr_BadInternalCall();
1584 return -1;
1585 }
1586
Benjamin Petersonbac79492012-01-14 13:34:47 -05001587 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001588 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001589 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001590 return -1;
1591
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001592 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001593 PyErr_SetString(PyExc_IndexError, "string index out of range");
1594 return -1;
1595 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001596 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001597 PyErr_SetString(PyExc_IndexError, "string index out of range");
1598 return -1;
1599 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001600 if (how_many < 0) {
1601 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1602 return -1;
1603 }
1604 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001605 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1606 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001607 "Cannot write %zi characters at %zi "
1608 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001609 how_many, to_start, PyUnicode_GET_LENGTH(to));
1610 return -1;
1611 }
1612
1613 if (how_many == 0)
1614 return 0;
1615
Victor Stinner488fa492011-12-12 00:01:39 +01001616 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001617 return -1;
1618
1619 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1620 if (err) {
1621 PyErr_Format(PyExc_SystemError,
1622 "Cannot copy %s characters "
1623 "into a string of %s characters",
1624 unicode_kind_name(from),
1625 unicode_kind_name(to));
1626 return -1;
1627 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001628 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629}
1630
Victor Stinner17222162011-09-28 22:15:37 +02001631/* Find the maximum code point and count the number of surrogate pairs so a
1632 correct string length can be computed before converting a string to UCS4.
1633 This function counts single surrogates as a character and not as a pair.
1634
1635 Return 0 on success, or -1 on error. */
1636static int
1637find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1638 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001639{
1640 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001641 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642
Victor Stinnerc53be962011-10-02 21:33:54 +02001643 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 *num_surrogates = 0;
1645 *maxchar = 0;
1646
1647 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001649 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1650 && (iter+1) < end
1651 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1652 {
1653 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1654 ++(*num_surrogates);
1655 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 }
1657 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001659 {
1660 ch = *iter;
1661 iter++;
1662 }
1663 if (ch > *maxchar) {
1664 *maxchar = ch;
1665 if (*maxchar > MAX_UNICODE) {
1666 PyErr_Format(PyExc_ValueError,
1667 "character U+%x is not in range [U+0000; U+10ffff]",
1668 ch);
1669 return -1;
1670 }
1671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 }
1673 return 0;
1674}
1675
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001676int
1677_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678{
1679 wchar_t *end;
1680 Py_UCS4 maxchar = 0;
1681 Py_ssize_t num_surrogates;
1682#if SIZEOF_WCHAR_T == 2
1683 Py_ssize_t length_wo_surrogates;
1684#endif
1685
Georg Brandl7597add2011-10-05 16:36:47 +02001686 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001687 strings were created using _PyObject_New() and where no canonical
1688 representation (the str field) has been set yet aka strings
1689 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001690 assert(_PyUnicode_CHECK(unicode));
1691 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001692 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001693 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001694 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001695 /* Actually, it should neither be interned nor be anything else: */
1696 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001699 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001700 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001702
1703 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001704 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1705 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 PyErr_NoMemory();
1707 return -1;
1708 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001709 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 _PyUnicode_WSTR(unicode), end,
1711 PyUnicode_1BYTE_DATA(unicode));
1712 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1713 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1714 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1715 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001716 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001717 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001718 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 }
1720 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001721 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001722 _PyUnicode_UTF8(unicode) = NULL;
1723 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 }
1725 PyObject_FREE(_PyUnicode_WSTR(unicode));
1726 _PyUnicode_WSTR(unicode) = NULL;
1727 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1728 }
1729 /* In this case we might have to convert down from 4-byte native
1730 wchar_t to 2-byte unicode. */
1731 else if (maxchar < 65536) {
1732 assert(num_surrogates == 0 &&
1733 "FindMaxCharAndNumSurrogatePairs() messed up");
1734
Victor Stinner506f5922011-09-28 22:34:18 +02001735#if SIZEOF_WCHAR_T == 2
1736 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001737 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001738 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1739 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1740 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001741 _PyUnicode_UTF8(unicode) = NULL;
1742 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001743#else
1744 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001745 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001746 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001747 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001748 PyErr_NoMemory();
1749 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 }
Victor Stinner506f5922011-09-28 22:34:18 +02001751 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1752 _PyUnicode_WSTR(unicode), end,
1753 PyUnicode_2BYTE_DATA(unicode));
1754 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1755 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1756 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001757 _PyUnicode_UTF8(unicode) = NULL;
1758 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001759 PyObject_FREE(_PyUnicode_WSTR(unicode));
1760 _PyUnicode_WSTR(unicode) = NULL;
1761 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1762#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 }
1764 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1765 else {
1766#if SIZEOF_WCHAR_T == 2
1767 /* in case the native representation is 2-bytes, we need to allocate a
1768 new normalized 4-byte version. */
1769 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001770 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1771 PyErr_NoMemory();
1772 return -1;
1773 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001774 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1775 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 PyErr_NoMemory();
1777 return -1;
1778 }
1779 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1780 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001781 _PyUnicode_UTF8(unicode) = NULL;
1782 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001783 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1784 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001785 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786 PyObject_FREE(_PyUnicode_WSTR(unicode));
1787 _PyUnicode_WSTR(unicode) = NULL;
1788 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1789#else
1790 assert(num_surrogates == 0);
1791
Victor Stinnerc3c74152011-10-02 20:39:55 +02001792 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001794 _PyUnicode_UTF8(unicode) = NULL;
1795 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1797#endif
1798 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1799 }
1800 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001801 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 return 0;
1803}
1804
Alexander Belopolsky40018472011-02-26 01:02:56 +00001805static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001806unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807{
Walter Dörwald16807132007-05-25 13:52:07 +00001808 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001809 case SSTATE_NOT_INTERNED:
1810 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001811
Benjamin Peterson29060642009-01-31 22:14:21 +00001812 case SSTATE_INTERNED_MORTAL:
1813 /* revive dead object temporarily for DelItem */
1814 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001815 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001816 Py_FatalError(
1817 "deletion of interned string failed");
1818 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001819
Benjamin Peterson29060642009-01-31 22:14:21 +00001820 case SSTATE_INTERNED_IMMORTAL:
1821 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001822 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001823
Benjamin Peterson29060642009-01-31 22:14:21 +00001824 default:
1825 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001826 }
1827
Victor Stinner03490912011-10-03 23:45:12 +02001828 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001830 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001831 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001832 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1833 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001835 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836}
1837
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons (the
   empty string or a cached one-character Latin-1 string), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical representation can be
       an entry of the unicode_latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1854
Alexander Belopolsky40018472011-02-26 01:02:56 +00001855static int
Victor Stinner488fa492011-12-12 00:01:39 +01001856unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001857{
Victor Stinner488fa492011-12-12 00:01:39 +01001858 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001859 if (Py_REFCNT(unicode) != 1)
1860 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001861 if (_PyUnicode_HASH(unicode) != -1)
1862 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001863 if (PyUnicode_CHECK_INTERNED(unicode))
1864 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001865 if (!PyUnicode_CheckExact(unicode))
1866 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001867#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001868 /* singleton refcount is greater than 1 */
1869 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001870#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001871 return 1;
1872}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001873
Victor Stinnerfe226c02011-10-03 03:52:20 +02001874static int
1875unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1876{
1877 PyObject *unicode;
1878 Py_ssize_t old_length;
1879
1880 assert(p_unicode != NULL);
1881 unicode = *p_unicode;
1882
1883 assert(unicode != NULL);
1884 assert(PyUnicode_Check(unicode));
1885 assert(0 <= length);
1886
Victor Stinner910337b2011-10-03 03:20:16 +02001887 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001888 old_length = PyUnicode_WSTR_LENGTH(unicode);
1889 else
1890 old_length = PyUnicode_GET_LENGTH(unicode);
1891 if (old_length == length)
1892 return 0;
1893
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001894 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001895 _Py_INCREF_UNICODE_EMPTY();
1896 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001897 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001898 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001899 return 0;
1900 }
1901
Victor Stinner488fa492011-12-12 00:01:39 +01001902 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001903 PyObject *copy = resize_copy(unicode, length);
1904 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001905 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001906 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001907 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001908 }
1909
Victor Stinnerfe226c02011-10-03 03:52:20 +02001910 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001911 PyObject *new_unicode = resize_compact(unicode, length);
1912 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001913 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001914 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001915 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001916 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001917 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001918}
1919
Alexander Belopolsky40018472011-02-26 01:02:56 +00001920int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001921PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001922{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001923 PyObject *unicode;
1924 if (p_unicode == NULL) {
1925 PyErr_BadInternalCall();
1926 return -1;
1927 }
1928 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001929 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001930 {
1931 PyErr_BadInternalCall();
1932 return -1;
1933 }
1934 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001935}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001936
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001937/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001938
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001939 WARNING: The function doesn't copy the terminating null character and
1940 doesn't check the maximum character (may write a latin1 character in an
1941 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001942static void
1943unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1944 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001945{
1946 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1947 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001948 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001949
1950 switch (kind) {
1951 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001952 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001953#ifdef Py_DEBUG
1954 if (PyUnicode_IS_ASCII(unicode)) {
1955 Py_UCS4 maxchar = ucs1lib_find_max_char(
1956 (const Py_UCS1*)str,
1957 (const Py_UCS1*)str + len);
1958 assert(maxchar < 128);
1959 }
1960#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001961 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001962 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001963 }
1964 case PyUnicode_2BYTE_KIND: {
1965 Py_UCS2 *start = (Py_UCS2 *)data + index;
1966 Py_UCS2 *ucs2 = start;
1967 assert(index <= PyUnicode_GET_LENGTH(unicode));
1968
Victor Stinner184252a2012-06-16 02:57:41 +02001969 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001970 *ucs2 = (Py_UCS2)*str;
1971
1972 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001973 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001974 }
1975 default: {
1976 Py_UCS4 *start = (Py_UCS4 *)data + index;
1977 Py_UCS4 *ucs4 = start;
1978 assert(kind == PyUnicode_4BYTE_KIND);
1979 assert(index <= PyUnicode_GET_LENGTH(unicode));
1980
Victor Stinner184252a2012-06-16 02:57:41 +02001981 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001982 *ucs4 = (Py_UCS4)*str;
1983
1984 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001985 }
1986 }
1987}
1988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989static PyObject*
1990get_latin1_char(unsigned char ch)
1991{
Victor Stinnera464fc12011-10-02 20:39:30 +02001992 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001994 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 if (!unicode)
1996 return NULL;
1997 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001998 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 unicode_latin1[ch] = unicode;
2000 }
2001 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002002 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003}
2004
Victor Stinner985a82a2014-01-03 12:53:47 +01002005static PyObject*
2006unicode_char(Py_UCS4 ch)
2007{
2008 PyObject *unicode;
2009
2010 assert(ch <= MAX_UNICODE);
2011
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002012 if (ch < 256)
2013 return get_latin1_char(ch);
2014
Victor Stinner985a82a2014-01-03 12:53:47 +01002015 unicode = PyUnicode_New(1, ch);
2016 if (unicode == NULL)
2017 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002018
2019 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2020 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002021 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002022 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002023 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2024 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2025 }
2026 assert(_PyUnicode_CheckConsistency(unicode, 1));
2027 return unicode;
2028}
2029
Alexander Belopolsky40018472011-02-26 01:02:56 +00002030PyObject *
2031PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002033 if (u == NULL)
2034 return (PyObject*)_PyUnicode_New(size);
2035
2036 if (size < 0) {
2037 PyErr_BadInternalCall();
2038 return NULL;
2039 }
2040
2041 return PyUnicode_FromWideChar(u, size);
2042}
2043
2044PyObject *
2045PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2046{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002047 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 Py_UCS4 maxchar = 0;
2049 Py_ssize_t num_surrogates;
2050
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002051 if (u == NULL && size != 0) {
2052 PyErr_BadInternalCall();
2053 return NULL;
2054 }
2055
2056 if (size == -1) {
2057 size = wcslen(u);
2058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002060 /* If the Unicode data is known at construction time, we can apply
2061 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002063 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002064 if (size == 0)
2065 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067 /* Single character Unicode objects in the Latin-1 range are
2068 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002069 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002070 return get_latin1_char((unsigned char)*u);
2071
2072 /* If not empty and not single character, copy the Unicode data
2073 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002074 if (find_maxchar_surrogates(u, u + size,
2075 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 return NULL;
2077
Victor Stinner8faf8212011-12-08 22:14:11 +01002078 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 if (!unicode)
2080 return NULL;
2081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082 switch (PyUnicode_KIND(unicode)) {
2083 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002084 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2086 break;
2087 case PyUnicode_2BYTE_KIND:
2088#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002089 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002090#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002091 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2093#endif
2094 break;
2095 case PyUnicode_4BYTE_KIND:
2096#if SIZEOF_WCHAR_T == 2
2097 /* This is the only case which has to process surrogates, thus
2098 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002099 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100#else
2101 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002102 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002103#endif
2104 break;
2105 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002106 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002109 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110}
2111
Alexander Belopolsky40018472011-02-26 01:02:56 +00002112PyObject *
2113PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002114{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002115 if (size < 0) {
2116 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002117 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002118 return NULL;
2119 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002120 if (u != NULL)
2121 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2122 else
2123 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002124}
2125
Alexander Belopolsky40018472011-02-26 01:02:56 +00002126PyObject *
2127PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002128{
2129 size_t size = strlen(u);
2130 if (size > PY_SSIZE_T_MAX) {
2131 PyErr_SetString(PyExc_OverflowError, "input too long");
2132 return NULL;
2133 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002134 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002135}
2136
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002137PyObject *
2138_PyUnicode_FromId(_Py_Identifier *id)
2139{
2140 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002141 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2142 strlen(id->string),
2143 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002144 if (!id->object)
2145 return NULL;
2146 PyUnicode_InternInPlace(&id->object);
2147 assert(!id->next);
2148 id->next = static_strings;
2149 static_strings = id;
2150 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002151 return id->object;
2152}
2153
2154void
2155_PyUnicode_ClearStaticStrings()
2156{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002157 _Py_Identifier *tmp, *s = static_strings;
2158 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002159 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002160 tmp = s->next;
2161 s->next = NULL;
2162 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002163 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002164 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002165}
2166
Benjamin Peterson0df54292012-03-26 14:50:32 -04002167/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002168
Victor Stinnerd3f08822012-05-29 12:57:52 +02002169PyObject*
2170_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002171{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002172 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002173 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002174 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002175#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002176 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002177#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002178 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002179 }
Victor Stinner785938e2011-12-11 20:09:03 +01002180 unicode = PyUnicode_New(size, 127);
2181 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002182 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002183 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2184 assert(_PyUnicode_CheckConsistency(unicode, 1));
2185 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002186}
2187
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002188static Py_UCS4
2189kind_maxchar_limit(unsigned int kind)
2190{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002191 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002192 case PyUnicode_1BYTE_KIND:
2193 return 0x80;
2194 case PyUnicode_2BYTE_KIND:
2195 return 0x100;
2196 case PyUnicode_4BYTE_KIND:
2197 return 0x10000;
2198 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002199 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002200 }
2201}
2202
Victor Stinner702c7342011-10-05 13:50:52 +02002203static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002204_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002205{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002207 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002208
Serhiy Storchaka678db842013-01-26 12:16:36 +02002209 if (size == 0)
2210 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002211 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002212 if (size == 1)
2213 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002214
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002215 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002216 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 if (!res)
2218 return NULL;
2219 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002220 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002222}
2223
Victor Stinnere57b1c02011-09-28 22:20:48 +02002224static PyObject*
2225_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226{
2227 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002228 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002229
Serhiy Storchaka678db842013-01-26 12:16:36 +02002230 if (size == 0)
2231 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002232 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002233 if (size == 1)
2234 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002235
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002236 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002237 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 if (!res)
2239 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002240 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002242 else {
2243 _PyUnicode_CONVERT_BYTES(
2244 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2245 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002246 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 return res;
2248}
2249
Victor Stinnere57b1c02011-09-28 22:20:48 +02002250static PyObject*
2251_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252{
2253 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002254 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002255
Serhiy Storchaka678db842013-01-26 12:16:36 +02002256 if (size == 0)
2257 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002258 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002259 if (size == 1)
2260 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002261
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002262 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002263 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 if (!res)
2265 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002266 if (max_char < 256)
2267 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2268 PyUnicode_1BYTE_DATA(res));
2269 else if (max_char < 0x10000)
2270 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2271 PyUnicode_2BYTE_DATA(res));
2272 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002274 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 return res;
2276}
2277
2278PyObject*
2279PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2280{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002281 if (size < 0) {
2282 PyErr_SetString(PyExc_ValueError, "size must be positive");
2283 return NULL;
2284 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002285 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002287 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002289 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002290 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002291 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002293 PyErr_SetString(PyExc_SystemError, "invalid kind");
2294 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002296}
2297
Victor Stinnerece58de2012-04-23 23:36:38 +02002298Py_UCS4
2299_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2300{
2301 enum PyUnicode_Kind kind;
2302 void *startptr, *endptr;
2303
2304 assert(PyUnicode_IS_READY(unicode));
2305 assert(0 <= start);
2306 assert(end <= PyUnicode_GET_LENGTH(unicode));
2307 assert(start <= end);
2308
2309 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2310 return PyUnicode_MAX_CHAR_VALUE(unicode);
2311
2312 if (start == end)
2313 return 127;
2314
Victor Stinner94d558b2012-04-27 22:26:58 +02002315 if (PyUnicode_IS_ASCII(unicode))
2316 return 127;
2317
Victor Stinnerece58de2012-04-23 23:36:38 +02002318 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002319 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002320 endptr = (char *)startptr + end * kind;
2321 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002322 switch(kind) {
2323 case PyUnicode_1BYTE_KIND:
2324 return ucs1lib_find_max_char(startptr, endptr);
2325 case PyUnicode_2BYTE_KIND:
2326 return ucs2lib_find_max_char(startptr, endptr);
2327 case PyUnicode_4BYTE_KIND:
2328 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002329 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002330 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002331 }
2332}
2333
Victor Stinner25a4b292011-10-06 12:31:55 +02002334/* Ensure that a string uses the most efficient storage, if it is not the
2335 case: create a new string with of the right kind. Write NULL into *p_unicode
2336 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002337static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002338unicode_adjust_maxchar(PyObject **p_unicode)
2339{
2340 PyObject *unicode, *copy;
2341 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002342 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002343 unsigned int kind;
2344
2345 assert(p_unicode != NULL);
2346 unicode = *p_unicode;
2347 assert(PyUnicode_IS_READY(unicode));
2348 if (PyUnicode_IS_ASCII(unicode))
2349 return;
2350
2351 len = PyUnicode_GET_LENGTH(unicode);
2352 kind = PyUnicode_KIND(unicode);
2353 if (kind == PyUnicode_1BYTE_KIND) {
2354 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002355 max_char = ucs1lib_find_max_char(u, u + len);
2356 if (max_char >= 128)
2357 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002358 }
2359 else if (kind == PyUnicode_2BYTE_KIND) {
2360 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002361 max_char = ucs2lib_find_max_char(u, u + len);
2362 if (max_char >= 256)
2363 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002364 }
2365 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002366 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002367 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002368 max_char = ucs4lib_find_max_char(u, u + len);
2369 if (max_char >= 0x10000)
2370 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002371 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002372 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002373 if (copy != NULL)
2374 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002375 Py_DECREF(unicode);
2376 *p_unicode = copy;
2377}
2378
Victor Stinner034f6cf2011-09-30 02:26:44 +02002379PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002380_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002381{
Victor Stinner87af4f22011-11-21 23:03:47 +01002382 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002384
Victor Stinner034f6cf2011-09-30 02:26:44 +02002385 if (!PyUnicode_Check(unicode)) {
2386 PyErr_BadInternalCall();
2387 return NULL;
2388 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002389 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002390 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002391
Victor Stinner87af4f22011-11-21 23:03:47 +01002392 length = PyUnicode_GET_LENGTH(unicode);
2393 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002394 if (!copy)
2395 return NULL;
2396 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2397
Christian Heimesf051e432016-09-13 20:22:02 +02002398 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002399 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002400 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002401 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002402}
2403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404
Victor Stinnerbc603d12011-10-02 01:00:40 +02002405/* Widen Unicode objects to larger buffers. Don't write terminating null
2406 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407
2408void*
2409_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2410{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002411 Py_ssize_t len;
2412 void *result;
2413 unsigned int skind;
2414
Benjamin Petersonbac79492012-01-14 13:34:47 -05002415 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002416 return NULL;
2417
2418 len = PyUnicode_GET_LENGTH(s);
2419 skind = PyUnicode_KIND(s);
2420 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002421 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 return NULL;
2423 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002424 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002425 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002426 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002427 if (!result)
2428 return PyErr_NoMemory();
2429 assert(skind == PyUnicode_1BYTE_KIND);
2430 _PyUnicode_CONVERT_BYTES(
2431 Py_UCS1, Py_UCS2,
2432 PyUnicode_1BYTE_DATA(s),
2433 PyUnicode_1BYTE_DATA(s) + len,
2434 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002436 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002437 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002438 if (!result)
2439 return PyErr_NoMemory();
2440 if (skind == PyUnicode_2BYTE_KIND) {
2441 _PyUnicode_CONVERT_BYTES(
2442 Py_UCS2, Py_UCS4,
2443 PyUnicode_2BYTE_DATA(s),
2444 PyUnicode_2BYTE_DATA(s) + len,
2445 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002447 else {
2448 assert(skind == PyUnicode_1BYTE_KIND);
2449 _PyUnicode_CONVERT_BYTES(
2450 Py_UCS1, Py_UCS4,
2451 PyUnicode_1BYTE_DATA(s),
2452 PyUnicode_1BYTE_DATA(s) + len,
2453 result);
2454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002456 default:
2457 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 }
Victor Stinner01698042011-10-04 00:04:26 +02002459 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460 return NULL;
2461}
2462
2463static Py_UCS4*
2464as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2465 int copy_null)
2466{
2467 int kind;
2468 void *data;
2469 Py_ssize_t len, targetlen;
2470 if (PyUnicode_READY(string) == -1)
2471 return NULL;
2472 kind = PyUnicode_KIND(string);
2473 data = PyUnicode_DATA(string);
2474 len = PyUnicode_GET_LENGTH(string);
2475 targetlen = len;
2476 if (copy_null)
2477 targetlen++;
2478 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002479 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 if (!target) {
2481 PyErr_NoMemory();
2482 return NULL;
2483 }
2484 }
2485 else {
2486 if (targetsize < targetlen) {
2487 PyErr_Format(PyExc_SystemError,
2488 "string is longer than the buffer");
2489 if (copy_null && 0 < targetsize)
2490 target[0] = 0;
2491 return NULL;
2492 }
2493 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002494 if (kind == PyUnicode_1BYTE_KIND) {
2495 Py_UCS1 *start = (Py_UCS1 *) data;
2496 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002498 else if (kind == PyUnicode_2BYTE_KIND) {
2499 Py_UCS2 *start = (Py_UCS2 *) data;
2500 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2501 }
2502 else {
2503 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002504 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 if (copy_null)
2507 target[len] = 0;
2508 return target;
2509}
2510
2511Py_UCS4*
2512PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2513 int copy_null)
2514{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002515 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 PyErr_BadInternalCall();
2517 return NULL;
2518 }
2519 return as_ucs4(string, target, targetsize, copy_null);
2520}
2521
2522Py_UCS4*
2523PyUnicode_AsUCS4Copy(PyObject *string)
2524{
2525 return as_ucs4(string, NULL, 0, 1);
2526}
2527
/* Maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign. 53/22 is an upper bound for log10(256), and
   1 + (SIZEOF_LONG_LONG*53-1) / 22 is that ceiling computed in integer
   arithmetic; the remaining 1 accounts for the sign. */
2531#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002532
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002533static int
2534unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2535 Py_ssize_t width, Py_ssize_t precision)
2536{
2537 Py_ssize_t length, fill, arglen;
2538 Py_UCS4 maxchar;
2539
2540 if (PyUnicode_READY(str) == -1)
2541 return -1;
2542
2543 length = PyUnicode_GET_LENGTH(str);
2544 if ((precision == -1 || precision >= length)
2545 && width <= length)
2546 return _PyUnicodeWriter_WriteStr(writer, str);
2547
2548 if (precision != -1)
2549 length = Py_MIN(precision, length);
2550
2551 arglen = Py_MAX(length, width);
2552 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2553 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2554 else
2555 maxchar = writer->maxchar;
2556
2557 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2558 return -1;
2559
2560 if (width > length) {
2561 fill = width - length;
2562 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2563 return -1;
2564 writer->pos += fill;
2565 }
2566
2567 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2568 str, 0, length);
2569 writer->pos += length;
2570 return 0;
2571}
2572
2573static int
2574unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2575 Py_ssize_t width, Py_ssize_t precision)
2576{
2577 /* UTF-8 */
2578 Py_ssize_t length;
2579 PyObject *unicode;
2580 int res;
2581
2582 length = strlen(str);
2583 if (precision != -1)
2584 length = Py_MIN(length, precision);
2585 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2586 if (unicode == NULL)
2587 return -1;
2588
2589 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2590 Py_DECREF(unicode);
2591 return res;
2592}
2593
Victor Stinner96865452011-03-01 23:44:09 +00002594static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002595unicode_fromformat_arg(_PyUnicodeWriter *writer,
2596 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002597{
Victor Stinnere215d962012-10-06 23:03:36 +02002598 const char *p;
2599 Py_ssize_t len;
2600 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002601 Py_ssize_t width;
2602 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002603 int longflag;
2604 int longlongflag;
2605 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002607
2608 p = f;
2609 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002610 zeropad = 0;
2611 if (*f == '0') {
2612 zeropad = 1;
2613 f++;
2614 }
Victor Stinner96865452011-03-01 23:44:09 +00002615
2616 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002617 width = -1;
2618 if (Py_ISDIGIT((unsigned)*f)) {
2619 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002620 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002621 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002622 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002623 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002624 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002625 return NULL;
2626 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002627 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002628 f++;
2629 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002630 }
2631 precision = -1;
2632 if (*f == '.') {
2633 f++;
2634 if (Py_ISDIGIT((unsigned)*f)) {
2635 precision = (*f - '0');
2636 f++;
2637 while (Py_ISDIGIT((unsigned)*f)) {
2638 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2639 PyErr_SetString(PyExc_ValueError,
2640 "precision too big");
2641 return NULL;
2642 }
2643 precision = (precision * 10) + (*f - '0');
2644 f++;
2645 }
2646 }
Victor Stinner96865452011-03-01 23:44:09 +00002647 if (*f == '%') {
2648 /* "%.3%s" => f points to "3" */
2649 f--;
2650 }
2651 }
2652 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002653 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002654 f--;
2655 }
Victor Stinner96865452011-03-01 23:44:09 +00002656
2657 /* Handle %ld, %lu, %lld and %llu. */
2658 longflag = 0;
2659 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002660 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002661 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002662 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002663 longflag = 1;
2664 ++f;
2665 }
Victor Stinner96865452011-03-01 23:44:09 +00002666 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002667 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002668 longlongflag = 1;
2669 f += 2;
2670 }
Victor Stinner96865452011-03-01 23:44:09 +00002671 }
2672 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002673 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002674 size_tflag = 1;
2675 ++f;
2676 }
Victor Stinnere215d962012-10-06 23:03:36 +02002677
2678 if (f[1] == '\0')
2679 writer->overallocate = 0;
2680
2681 switch (*f) {
2682 case 'c':
2683 {
2684 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002685 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002686 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002687 "character argument not in range(0x110000)");
2688 return NULL;
2689 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002690 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002691 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002692 break;
2693 }
2694
2695 case 'i':
2696 case 'd':
2697 case 'u':
2698 case 'x':
2699 {
2700 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002701 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002702 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002703
2704 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002705 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002706 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002707 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002708 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002709 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002710 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002711 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002712 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002713 va_arg(*vargs, size_t));
2714 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002715 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002716 va_arg(*vargs, unsigned int));
2717 }
2718 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002719 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002720 }
2721 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002722 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002723 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002724 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002725 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002726 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002727 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002728 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002729 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002730 va_arg(*vargs, Py_ssize_t));
2731 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002732 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002733 va_arg(*vargs, int));
2734 }
2735 assert(len >= 0);
2736
Victor Stinnere215d962012-10-06 23:03:36 +02002737 if (precision < len)
2738 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002739
2740 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002741 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2742 return NULL;
2743
Victor Stinnere215d962012-10-06 23:03:36 +02002744 if (width > precision) {
2745 Py_UCS4 fillchar;
2746 fill = width - precision;
2747 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002748 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2749 return NULL;
2750 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002751 }
Victor Stinner15a11362012-10-06 23:48:20 +02002752 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002753 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002754 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2755 return NULL;
2756 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002757 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002758
Victor Stinner4a587072013-11-19 12:54:53 +01002759 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2760 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002761 break;
2762 }
2763
2764 case 'p':
2765 {
2766 char number[MAX_LONG_LONG_CHARS];
2767
2768 len = sprintf(number, "%p", va_arg(*vargs, void*));
2769 assert(len >= 0);
2770
2771 /* %p is ill-defined: ensure leading 0x. */
2772 if (number[1] == 'X')
2773 number[1] = 'x';
2774 else if (number[1] != 'x') {
2775 memmove(number + 2, number,
2776 strlen(number) + 1);
2777 number[0] = '0';
2778 number[1] = 'x';
2779 len += 2;
2780 }
2781
Victor Stinner4a587072013-11-19 12:54:53 +01002782 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002783 return NULL;
2784 break;
2785 }
2786
2787 case 's':
2788 {
2789 /* UTF-8 */
2790 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002791 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002792 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002793 break;
2794 }
2795
2796 case 'U':
2797 {
2798 PyObject *obj = va_arg(*vargs, PyObject *);
2799 assert(obj && _PyUnicode_CHECK(obj));
2800
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002802 return NULL;
2803 break;
2804 }
2805
2806 case 'V':
2807 {
2808 PyObject *obj = va_arg(*vargs, PyObject *);
2809 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002810 if (obj) {
2811 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002812 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002813 return NULL;
2814 }
2815 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002816 assert(str != NULL);
2817 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002818 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002819 }
2820 break;
2821 }
2822
2823 case 'S':
2824 {
2825 PyObject *obj = va_arg(*vargs, PyObject *);
2826 PyObject *str;
2827 assert(obj);
2828 str = PyObject_Str(obj);
2829 if (!str)
2830 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002831 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002832 Py_DECREF(str);
2833 return NULL;
2834 }
2835 Py_DECREF(str);
2836 break;
2837 }
2838
2839 case 'R':
2840 {
2841 PyObject *obj = va_arg(*vargs, PyObject *);
2842 PyObject *repr;
2843 assert(obj);
2844 repr = PyObject_Repr(obj);
2845 if (!repr)
2846 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002847 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002848 Py_DECREF(repr);
2849 return NULL;
2850 }
2851 Py_DECREF(repr);
2852 break;
2853 }
2854
2855 case 'A':
2856 {
2857 PyObject *obj = va_arg(*vargs, PyObject *);
2858 PyObject *ascii;
2859 assert(obj);
2860 ascii = PyObject_ASCII(obj);
2861 if (!ascii)
2862 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002863 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002864 Py_DECREF(ascii);
2865 return NULL;
2866 }
2867 Py_DECREF(ascii);
2868 break;
2869 }
2870
2871 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002872 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002873 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002874 break;
2875
2876 default:
2877 /* if we stumble upon an unknown formatting code, copy the rest
2878 of the format string to the output string. (we cannot just
2879 skip the code, since there's no way to know what's in the
2880 argument list) */
2881 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002882 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002883 return NULL;
2884 f = p+len;
2885 return f;
2886 }
2887
2888 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002889 return f;
2890}
2891
Walter Dörwaldd2034312007-05-18 16:29:38 +00002892PyObject *
2893PyUnicode_FromFormatV(const char *format, va_list vargs)
2894{
Victor Stinnere215d962012-10-06 23:03:36 +02002895 va_list vargs2;
2896 const char *f;
2897 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002898
Victor Stinner8f674cc2013-04-17 23:02:17 +02002899 _PyUnicodeWriter_Init(&writer);
2900 writer.min_length = strlen(format) + 100;
2901 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002902
Benjamin Peterson0c212142016-09-20 20:39:33 -07002903 // Copy varags to be able to pass a reference to a subfunction.
2904 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002905
2906 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002907 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002908 f = unicode_fromformat_arg(&writer, f, &vargs2);
2909 if (f == NULL)
2910 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002912 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002913 const char *p;
2914 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002915
Victor Stinnere215d962012-10-06 23:03:36 +02002916 p = f;
2917 do
2918 {
2919 if ((unsigned char)*p > 127) {
2920 PyErr_Format(PyExc_ValueError,
2921 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2922 "string, got a non-ASCII byte: 0x%02x",
2923 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002924 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002925 }
2926 p++;
2927 }
2928 while (*p != '\0' && *p != '%');
2929 len = p - f;
2930
2931 if (*p == '\0')
2932 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002933
2934 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002935 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002936
2937 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002939 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002940 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002941 return _PyUnicodeWriter_Finish(&writer);
2942
2943 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002944 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002945 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002946 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002947}
2948
Walter Dörwaldd2034312007-05-18 16:29:38 +00002949PyObject *
2950PyUnicode_FromFormat(const char *format, ...)
2951{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002952 PyObject* ret;
2953 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002954
2955#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002956 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002957#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002958 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002959#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002960 ret = PyUnicode_FromFormatV(format, vargs);
2961 va_end(vargs);
2962 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002963}
2964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002965#ifdef HAVE_WCHAR_H
2966
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002967/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002968
Victor Stinnerd88d9832011-09-06 02:00:05 +02002969 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002970 character) required to convert the unicode object. Ignore size argument.
2971
Victor Stinnerd88d9832011-09-06 02:00:05 +02002972 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002973 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002974 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002975Py_ssize_t
2976PyUnicode_AsWideChar(PyObject *unicode,
2977 wchar_t *w,
2978 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002979{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002980 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002981 const wchar_t *wstr;
2982
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002983 if (unicode == NULL) {
2984 PyErr_BadInternalCall();
2985 return -1;
2986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002987 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002988 if (wstr == NULL)
2989 return -1;
2990
Victor Stinner5593d8a2010-10-02 11:11:27 +00002991 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002992 if (size > res)
2993 size = res + 1;
2994 else
2995 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002996 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002997 return res;
2998 }
2999 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003000 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00003001}
3002
Victor Stinner137c34c2010-09-29 10:25:54 +00003003wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003004PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003005 Py_ssize_t *size)
3006{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003007 const wchar_t *wstr;
3008 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003009 Py_ssize_t buflen;
3010
3011 if (unicode == NULL) {
3012 PyErr_BadInternalCall();
3013 return NULL;
3014 }
3015
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003016 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3017 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003018 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003019 }
3020 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3021 PyErr_SetString(PyExc_ValueError,
3022 "embedded null character");
3023 return NULL;
3024 }
3025
3026 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00003027 if (buffer == NULL) {
3028 PyErr_NoMemory();
3029 return NULL;
3030 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003031 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00003032 if (size != NULL)
3033 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003034 return buffer;
3035}
3036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003037#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038
Alexander Belopolsky40018472011-02-26 01:02:56 +00003039PyObject *
3040PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003041{
Victor Stinner8faf8212011-12-08 22:14:11 +01003042 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 PyErr_SetString(PyExc_ValueError,
3044 "chr() arg not in range(0x110000)");
3045 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003046 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003047
Victor Stinner985a82a2014-01-03 12:53:47 +01003048 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003049}
3050
Alexander Belopolsky40018472011-02-26 01:02:56 +00003051PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003052PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003054 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003055 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003056 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003057 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003058 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 Py_INCREF(obj);
3060 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003061 }
3062 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003063 /* For a Unicode subtype that's not a Unicode object,
3064 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003065 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003066 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003067 PyErr_Format(PyExc_TypeError,
3068 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003069 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003070 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003071}
3072
Alexander Belopolsky40018472011-02-26 01:02:56 +00003073PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003074PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003075 const char *encoding,
3076 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003077{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003078 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003079 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003080
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003082 PyErr_BadInternalCall();
3083 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003085
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003086 /* Decoding bytes objects is the most common case and should be fast */
3087 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003088 if (PyBytes_GET_SIZE(obj) == 0)
3089 _Py_RETURN_UNICODE_EMPTY();
3090 v = PyUnicode_Decode(
3091 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3092 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 return v;
3094 }
3095
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003096 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003097 PyErr_SetString(PyExc_TypeError,
3098 "decoding str is not supported");
3099 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003100 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003101
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003102 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3103 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3104 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003105 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003106 Py_TYPE(obj)->tp_name);
3107 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003108 }
Tim Petersced69f82003-09-16 20:30:58 +00003109
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003110 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003111 PyBuffer_Release(&buffer);
3112 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003114
Serhiy Storchaka05997252013-01-26 12:14:02 +02003115 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003116 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003117 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118}
3119
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) into `lower`; any
   run of other characters between two copied characters becomes a single
   '_', and such runs at the start or end of the name are dropped.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output buffer: reserved for the terminator. */
    char *const dst_limit = lower + (lower_len - 1);
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != '\0'; src++) {
        char c = *src;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Separator: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        /* A separator is only written between two copied characters,
           never before the first one. */
        if (pending_sep && dst != lower) {
            if (dst == dst_limit) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_limit) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3168
Alexander Belopolsky40018472011-02-26 01:02:56 +00003169PyObject *
3170PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003171 Py_ssize_t size,
3172 const char *encoding,
3173 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003174{
3175 PyObject *buffer = NULL, *unicode;
3176 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003177 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3178
3179 if (encoding == NULL) {
3180 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3181 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003182
Fred Drakee4315f52000-05-09 19:53:39 +00003183 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003184 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3185 char *lower = buflower;
3186
3187 /* Fast paths */
3188 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3189 lower += 3;
3190 if (*lower == '_') {
3191 /* Match "utf8" and "utf_8" */
3192 lower++;
3193 }
3194
3195 if (lower[0] == '8' && lower[1] == 0) {
3196 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3197 }
3198 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3199 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3200 }
3201 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3202 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3203 }
3204 }
3205 else {
3206 if (strcmp(lower, "ascii") == 0
3207 || strcmp(lower, "us_ascii") == 0) {
3208 return PyUnicode_DecodeASCII(s, size, errors);
3209 }
Steve Dowercc16be82016-09-08 10:35:16 -07003210 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003211 else if (strcmp(lower, "mbcs") == 0) {
3212 return PyUnicode_DecodeMBCS(s, size, errors);
3213 }
3214 #endif
3215 else if (strcmp(lower, "latin1") == 0
3216 || strcmp(lower, "latin_1") == 0
3217 || strcmp(lower, "iso_8859_1") == 0
3218 || strcmp(lower, "iso8859_1") == 0) {
3219 return PyUnicode_DecodeLatin1(s, size, errors);
3220 }
3221 }
Victor Stinner37296e82010-06-10 13:36:23 +00003222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223
3224 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003225 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003226 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003227 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003228 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 if (buffer == NULL)
3230 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003231 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232 if (unicode == NULL)
3233 goto onError;
3234 if (!PyUnicode_Check(unicode)) {
3235 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003236 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3237 "use codecs.decode() to decode to arbitrary types",
3238 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003239 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 Py_DECREF(unicode);
3241 goto onError;
3242 }
3243 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003244 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003245
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 Py_XDECREF(buffer);
3248 return NULL;
3249}
3250
Alexander Belopolsky40018472011-02-26 01:02:56 +00003251PyObject *
3252PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003253 const char *encoding,
3254 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003255{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003256 if (!PyUnicode_Check(unicode)) {
3257 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003258 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003259 }
3260
Serhiy Storchaka00939072016-10-27 21:05:49 +03003261 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3262 "PyUnicode_AsDecodedObject() is deprecated; "
3263 "use PyCodec_Decode() to decode from str", 1) < 0)
3264 return NULL;
3265
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003266 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003268
3269 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003270 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003271}
3272
Alexander Belopolsky40018472011-02-26 01:02:56 +00003273PyObject *
3274PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003275 const char *encoding,
3276 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003277{
3278 PyObject *v;
3279
3280 if (!PyUnicode_Check(unicode)) {
3281 PyErr_BadArgument();
3282 goto onError;
3283 }
3284
Serhiy Storchaka00939072016-10-27 21:05:49 +03003285 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3286 "PyUnicode_AsDecodedUnicode() is deprecated; "
3287 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3288 return NULL;
3289
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003290 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003292
3293 /* Decode via the codec registry */
3294 v = PyCodec_Decode(unicode, encoding, errors);
3295 if (v == NULL)
3296 goto onError;
3297 if (!PyUnicode_Check(v)) {
3298 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003299 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3300 "use codecs.decode() to decode to arbitrary types",
3301 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003302 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003303 Py_DECREF(v);
3304 goto onError;
3305 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003306 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003307
Benjamin Peterson29060642009-01-31 22:14:21 +00003308 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003309 return NULL;
3310}
3311
Alexander Belopolsky40018472011-02-26 01:02:56 +00003312PyObject *
3313PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003314 Py_ssize_t size,
3315 const char *encoding,
3316 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317{
3318 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003319
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003320 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3324 Py_DECREF(unicode);
3325 return v;
3326}
3327
Alexander Belopolsky40018472011-02-26 01:02:56 +00003328PyObject *
3329PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003330 const char *encoding,
3331 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003332{
3333 PyObject *v;
3334
3335 if (!PyUnicode_Check(unicode)) {
3336 PyErr_BadArgument();
3337 goto onError;
3338 }
3339
Serhiy Storchaka00939072016-10-27 21:05:49 +03003340 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3341 "PyUnicode_AsEncodedObject() is deprecated; "
3342 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3343 "or PyCodec_Encode() for generic encoding", 1) < 0)
3344 return NULL;
3345
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003346 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003347 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003348
3349 /* Encode via the codec registry */
3350 v = PyCodec_Encode(unicode, encoding, errors);
3351 if (v == NULL)
3352 goto onError;
3353 return v;
3354
Benjamin Peterson29060642009-01-31 22:14:21 +00003355 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003356 return NULL;
3357}
3358
Victor Stinner1b579672011-12-17 05:47:23 +01003359static int
3360locale_error_handler(const char *errors, int *surrogateescape)
3361{
Victor Stinner50149202015-09-22 00:26:54 +02003362 _Py_error_handler error_handler = get_error_handler(errors);
3363 switch (error_handler)
3364 {
3365 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003366 *surrogateescape = 0;
3367 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003368 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003369 *surrogateescape = 1;
3370 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003371 default:
3372 PyErr_Format(PyExc_ValueError,
3373 "only 'strict' and 'surrogateescape' error handlers "
3374 "are supported, not '%s'",
3375 errors);
3376 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003377 }
Victor Stinner1b579672011-12-17 05:47:23 +01003378}
3379
Victor Stinner2cba6b82018-01-10 22:46:15 +01003380static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003381unicode_encode_locale(PyObject *unicode, const char *errors,
3382 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003383{
Victor Stinner1b579672011-12-17 05:47:23 +01003384 int surrogateescape;
Victor Stinner1b579672011-12-17 05:47:23 +01003385 if (locale_error_handler(errors, &surrogateescape) < 0)
3386 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003387
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003388 Py_ssize_t wlen;
3389 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3390 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003391 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003392 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003393
Victor Stinner85ab9742018-11-28 12:42:40 +01003394 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003395 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner85ab9742018-11-28 12:42:40 +01003396 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003397 return NULL;
3398 }
3399
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003400 char *str;
3401 size_t error_pos;
3402 const char *reason;
3403 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3404 current_locale, surrogateescape);
Victor Stinner85ab9742018-11-28 12:42:40 +01003405 PyMem_Free(wstr);
3406
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003407 if (res != 0) {
3408 if (res == -2) {
3409 PyObject *exc;
3410 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3411 "locale", unicode,
3412 (Py_ssize_t)error_pos,
3413 (Py_ssize_t)(error_pos+1),
3414 reason);
3415 if (exc != NULL) {
3416 PyCodec_StrictErrors(exc);
3417 Py_DECREF(exc);
3418 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003419 }
3420 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003421 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003422 }
Victor Stinner85ab9742018-11-28 12:42:40 +01003423 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003424 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003425
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003426 PyObject *bytes = PyBytes_FromString(str);
3427 PyMem_RawFree(str);
3428 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003429}
3430
Victor Stinnerad158722010-10-27 00:25:46 +00003431PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003432PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3433{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003434 return unicode_encode_locale(unicode, errors, 1);
3435}
3436
3437PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003438PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003439{
Steve Dowercc16be82016-09-08 10:35:16 -07003440#if defined(__APPLE__)
3441 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003442#else
Victor Stinner793b5312011-04-27 00:24:21 +02003443 PyInterpreterState *interp = PyThreadState_GET()->interp;
3444 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3445 cannot use it to encode and decode filenames before it is loaded. Load
3446 the Python codec requires to encode at least its own filename. Use the C
3447 version of the locale codec until the codec registry is initialized and
3448 the Python codec is loaded.
3449
3450 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3451 cannot only rely on it: check also interp->fscodec_initialized for
3452 subinterpreters. */
3453 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003454 return PyUnicode_AsEncodedString(unicode,
3455 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003456 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003457 }
3458 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003459 return unicode_encode_locale(unicode,
3460 Py_FileSystemDefaultEncodeErrors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003461 }
Victor Stinnerad158722010-10-27 00:25:46 +00003462#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003463}
3464
Alexander Belopolsky40018472011-02-26 01:02:56 +00003465PyObject *
3466PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003467 const char *encoding,
3468 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469{
3470 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003471 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003472
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473 if (!PyUnicode_Check(unicode)) {
3474 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003475 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 }
Fred Drakee4315f52000-05-09 19:53:39 +00003477
Victor Stinner942889a2016-09-05 15:40:10 -07003478 if (encoding == NULL) {
3479 return _PyUnicode_AsUTF8String(unicode, errors);
3480 }
3481
Fred Drakee4315f52000-05-09 19:53:39 +00003482 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003483 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3484 char *lower = buflower;
3485
3486 /* Fast paths */
3487 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3488 lower += 3;
3489 if (*lower == '_') {
3490 /* Match "utf8" and "utf_8" */
3491 lower++;
3492 }
3493
3494 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003495 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003496 }
3497 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3498 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3499 }
3500 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3501 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3502 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003503 }
Victor Stinner942889a2016-09-05 15:40:10 -07003504 else {
3505 if (strcmp(lower, "ascii") == 0
3506 || strcmp(lower, "us_ascii") == 0) {
3507 return _PyUnicode_AsASCIIString(unicode, errors);
3508 }
Steve Dowercc16be82016-09-08 10:35:16 -07003509#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003510 else if (strcmp(lower, "mbcs") == 0) {
3511 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3512 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003513#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003514 else if (strcmp(lower, "latin1") == 0 ||
3515 strcmp(lower, "latin_1") == 0 ||
3516 strcmp(lower, "iso_8859_1") == 0 ||
3517 strcmp(lower, "iso8859_1") == 0) {
3518 return _PyUnicode_AsLatin1String(unicode, errors);
3519 }
3520 }
Victor Stinner37296e82010-06-10 13:36:23 +00003521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522
3523 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003524 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003526 return NULL;
3527
3528 /* The normal path */
3529 if (PyBytes_Check(v))
3530 return v;
3531
3532 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003533 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003534 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003535 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003536
3537 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003538 "encoder %s returned bytearray instead of bytes; "
3539 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003540 encoding);
3541 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003542 Py_DECREF(v);
3543 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003544 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003545
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003546 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3547 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003548 Py_DECREF(v);
3549 return b;
3550 }
3551
3552 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003553 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3554 "use codecs.encode() to encode to arbitrary types",
3555 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003556 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003557 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003558 return NULL;
3559}
3560
Alexander Belopolsky40018472011-02-26 01:02:56 +00003561PyObject *
3562PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003563 const char *encoding,
3564 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003565{
3566 PyObject *v;
3567
3568 if (!PyUnicode_Check(unicode)) {
3569 PyErr_BadArgument();
3570 goto onError;
3571 }
3572
Serhiy Storchaka00939072016-10-27 21:05:49 +03003573 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3574 "PyUnicode_AsEncodedUnicode() is deprecated; "
3575 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3576 return NULL;
3577
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003578 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003579 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003580
3581 /* Encode via the codec registry */
3582 v = PyCodec_Encode(unicode, encoding, errors);
3583 if (v == NULL)
3584 goto onError;
3585 if (!PyUnicode_Check(v)) {
3586 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003587 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3588 "use codecs.encode() to encode to arbitrary types",
3589 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003590 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003591 Py_DECREF(v);
3592 goto onError;
3593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003595
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597 return NULL;
3598}
3599
Victor Stinner2cba6b82018-01-10 22:46:15 +01003600static PyObject*
3601unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3602 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003603{
Victor Stinner1b579672011-12-17 05:47:23 +01003604 int surrogateescape;
Victor Stinner1b579672011-12-17 05:47:23 +01003605 if (locale_error_handler(errors, &surrogateescape) < 0)
3606 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003607
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003608 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3609 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003610 return NULL;
3611 }
3612
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003613 wchar_t *wstr;
3614 size_t wlen;
3615 const char *reason;
3616 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3617 current_locale, surrogateescape);
3618 if (res != 0) {
3619 if (res == -2) {
3620 PyObject *exc;
3621 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3622 "locale", str, len,
3623 (Py_ssize_t)wlen,
3624 (Py_ssize_t)(wlen + 1),
3625 reason);
3626 if (exc != NULL) {
3627 PyCodec_StrictErrors(exc);
3628 Py_DECREF(exc);
3629 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003630 }
3631 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003632 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003633 }
Victor Stinner2f197072011-12-17 07:08:30 +01003634 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003635 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003636
3637 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3638 PyMem_RawFree(wstr);
3639 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003640}
3641
3642PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003643PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3644 const char *errors)
3645{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003646 return unicode_decode_locale(str, len, errors, 1);
3647}
3648
3649PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003650PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003651{
3652 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003653 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003654}
3655
3656
3657PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003658PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003659 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003660 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3661}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003662
Christian Heimes5894ba72007-11-04 11:43:14 +00003663PyObject*
3664PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3665{
Steve Dowercc16be82016-09-08 10:35:16 -07003666#if defined(__APPLE__)
3667 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003668#else
Victor Stinner793b5312011-04-27 00:24:21 +02003669 PyInterpreterState *interp = PyThreadState_GET()->interp;
3670 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3671 cannot use it to encode and decode filenames before it is loaded. Load
3672 the Python codec requires to encode at least its own filename. Use the C
3673 version of the locale codec until the codec registry is initialized and
3674 the Python codec is loaded.
3675
3676 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3677 cannot only rely on it: check also interp->fscodec_initialized for
3678 subinterpreters. */
3679 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003680 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003681 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003682 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003683 }
3684 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003685 return unicode_decode_locale(s, size,
3686 Py_FileSystemDefaultEncodeErrors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003687 }
Victor Stinnerad158722010-10-27 00:25:46 +00003688#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003689}
3690
Martin v. Löwis011e8422009-05-05 04:43:17 +00003691
3692int
3693PyUnicode_FSConverter(PyObject* arg, void* addr)
3694{
Brett Cannonec6ce872016-09-06 15:50:29 -07003695 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003696 PyObject *output = NULL;
3697 Py_ssize_t size;
3698 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003699 if (arg == NULL) {
3700 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003701 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003702 return 1;
3703 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003704 path = PyOS_FSPath(arg);
3705 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003706 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003707 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003708 if (PyBytes_Check(path)) {
3709 output = path;
3710 }
3711 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3712 output = PyUnicode_EncodeFSDefault(path);
3713 Py_DECREF(path);
3714 if (!output) {
3715 return 0;
3716 }
3717 assert(PyBytes_Check(output));
3718 }
3719
Victor Stinner0ea2a462010-04-30 00:22:08 +00003720 size = PyBytes_GET_SIZE(output);
3721 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003722 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003723 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003724 Py_DECREF(output);
3725 return 0;
3726 }
3727 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003728 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003729}
3730
3731
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003732int
3733PyUnicode_FSDecoder(PyObject* arg, void* addr)
3734{
Brett Cannona5711202016-09-06 19:36:01 -07003735 int is_buffer = 0;
3736 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003737 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003738 if (arg == NULL) {
3739 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003740 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003741 return 1;
3742 }
Brett Cannona5711202016-09-06 19:36:01 -07003743
3744 is_buffer = PyObject_CheckBuffer(arg);
3745 if (!is_buffer) {
3746 path = PyOS_FSPath(arg);
3747 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003748 return 0;
3749 }
Brett Cannona5711202016-09-06 19:36:01 -07003750 }
3751 else {
3752 path = arg;
3753 Py_INCREF(arg);
3754 }
3755
3756 if (PyUnicode_Check(path)) {
3757 if (PyUnicode_READY(path) == -1) {
3758 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003759 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003760 }
3761 output = path;
3762 }
3763 else if (PyBytes_Check(path) || is_buffer) {
3764 PyObject *path_bytes = NULL;
3765
3766 if (!PyBytes_Check(path) &&
3767 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3768 "path should be string, bytes, or os.PathLike, not %.200s",
3769 Py_TYPE(arg)->tp_name)) {
3770 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003771 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003772 }
3773 path_bytes = PyBytes_FromObject(path);
3774 Py_DECREF(path);
3775 if (!path_bytes) {
3776 return 0;
3777 }
3778 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3779 PyBytes_GET_SIZE(path_bytes));
3780 Py_DECREF(path_bytes);
3781 if (!output) {
3782 return 0;
3783 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003784 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003785 else {
3786 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003787 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003788 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003789 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003790 return 0;
3791 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003792 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003793 Py_DECREF(output);
3794 return 0;
3795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003797 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003798 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003799 Py_DECREF(output);
3800 return 0;
3801 }
3802 *(PyObject**)addr = output;
3803 return Py_CLEANUP_SUPPORTED;
3804}
3805
3806
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003807const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003808PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003809{
Christian Heimesf3863112007-11-22 07:46:41 +00003810 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003812 if (!PyUnicode_Check(unicode)) {
3813 PyErr_BadArgument();
3814 return NULL;
3815 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003816 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003817 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003819 if (PyUnicode_UTF8(unicode) == NULL) {
3820 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003821 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003822 if (bytes == NULL)
3823 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003824 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3825 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003826 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827 Py_DECREF(bytes);
3828 return NULL;
3829 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003830 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003831 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 PyBytes_AS_STRING(bytes),
3833 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834 Py_DECREF(bytes);
3835 }
3836
3837 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003838 *psize = PyUnicode_UTF8_LENGTH(unicode);
3839 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003840}
3841
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003842const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003844{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3846}
3847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003848Py_UNICODE *
3849PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3850{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003851 const unsigned char *one_byte;
3852#if SIZEOF_WCHAR_T == 4
3853 const Py_UCS2 *two_bytes;
3854#else
3855 const Py_UCS4 *four_bytes;
3856 const Py_UCS4 *ucs4_end;
3857 Py_ssize_t num_surrogates;
3858#endif
3859 wchar_t *w;
3860 wchar_t *wchar_end;
3861
3862 if (!PyUnicode_Check(unicode)) {
3863 PyErr_BadArgument();
3864 return NULL;
3865 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003866 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003868 assert(_PyUnicode_KIND(unicode) != 0);
3869 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003871 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003872#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003873 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3874 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 num_surrogates = 0;
3876
3877 for (; four_bytes < ucs4_end; ++four_bytes) {
3878 if (*four_bytes > 0xFFFF)
3879 ++num_surrogates;
3880 }
3881
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3883 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3884 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885 PyErr_NoMemory();
3886 return NULL;
3887 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003888 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003890 w = _PyUnicode_WSTR(unicode);
3891 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3892 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3894 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003895 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003896 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003897 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3898 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899 }
3900 else
3901 *w = *four_bytes;
3902
3903 if (w > wchar_end) {
Barry Warsawb2e57942017-09-14 18:13:16 -07003904 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003905 }
3906 }
3907 *w = 0;
3908#else
3909 /* sizeof(wchar_t) == 4 */
3910 Py_FatalError("Impossible unicode object state, wstr and str "
3911 "should share memory already.");
3912 return NULL;
3913#endif
3914 }
3915 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003916 if ((size_t)_PyUnicode_LENGTH(unicode) >
3917 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3918 PyErr_NoMemory();
3919 return NULL;
3920 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003921 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3922 (_PyUnicode_LENGTH(unicode) + 1));
3923 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003924 PyErr_NoMemory();
3925 return NULL;
3926 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003927 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3928 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3929 w = _PyUnicode_WSTR(unicode);
3930 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003932 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3933 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 for (; w < wchar_end; ++one_byte, ++w)
3935 *w = *one_byte;
3936 /* null-terminate the wstr */
3937 *w = 0;
3938 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003939 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003940#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003941 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942 for (; w < wchar_end; ++two_bytes, ++w)
3943 *w = *two_bytes;
3944 /* null-terminate the wstr */
3945 *w = 0;
3946#else
3947 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003948 PyObject_FREE(_PyUnicode_WSTR(unicode));
3949 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003950 Py_FatalError("Impossible unicode object state, wstr "
3951 "and str should share memory already.");
3952 return NULL;
3953#endif
3954 }
3955 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07003956 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003957 }
3958 }
3959 }
3960 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003961 *size = PyUnicode_WSTR_LENGTH(unicode);
3962 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003963}
3964
Alexander Belopolsky40018472011-02-26 01:02:56 +00003965Py_UNICODE *
3966PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969}
3970
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003971const Py_UNICODE *
3972_PyUnicode_AsUnicode(PyObject *unicode)
3973{
3974 Py_ssize_t size;
3975 const Py_UNICODE *wstr;
3976
3977 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3978 if (wstr && wcslen(wstr) != (size_t)size) {
3979 PyErr_SetString(PyExc_ValueError, "embedded null character");
3980 return NULL;
3981 }
3982 return wstr;
3983}
3984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985
Alexander Belopolsky40018472011-02-26 01:02:56 +00003986Py_ssize_t
3987PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988{
3989 if (!PyUnicode_Check(unicode)) {
3990 PyErr_BadArgument();
3991 goto onError;
3992 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003993 if (_PyUnicode_WSTR(unicode) == NULL) {
3994 if (PyUnicode_AsUnicode(unicode) == NULL)
3995 goto onError;
3996 }
3997 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998
Benjamin Peterson29060642009-01-31 22:14:21 +00003999 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 return -1;
4001}
4002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003Py_ssize_t
4004PyUnicode_GetLength(PyObject *unicode)
4005{
Victor Stinner07621332012-06-16 04:53:46 +02004006 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 PyErr_BadArgument();
4008 return -1;
4009 }
Victor Stinner07621332012-06-16 04:53:46 +02004010 if (PyUnicode_READY(unicode) == -1)
4011 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 return PyUnicode_GET_LENGTH(unicode);
4013}
4014
4015Py_UCS4
4016PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4017{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004018 void *data;
4019 int kind;
4020
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004021 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004022 PyErr_BadArgument();
4023 return (Py_UCS4)-1;
4024 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004025 if (PyUnicode_READY(unicode) == -1) {
4026 return (Py_UCS4)-1;
4027 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004028 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004029 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 return (Py_UCS4)-1;
4031 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004032 data = PyUnicode_DATA(unicode);
4033 kind = PyUnicode_KIND(unicode);
4034 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035}
4036
4037int
4038PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4039{
4040 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004041 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 return -1;
4043 }
Victor Stinner488fa492011-12-12 00:01:39 +01004044 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004045 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004046 PyErr_SetString(PyExc_IndexError, "string index out of range");
4047 return -1;
4048 }
Victor Stinner488fa492011-12-12 00:01:39 +01004049 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004050 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004051 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4052 PyErr_SetString(PyExc_ValueError, "character out of range");
4053 return -1;
4054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4056 index, ch);
4057 return 0;
4058}
4059
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4065
Victor Stinner554f3f02010-06-16 23:33:54 +00004066/* create or adjust a UnicodeDecodeError */
4067static void
4068make_decode_exception(PyObject **exceptionObject,
4069 const char *encoding,
4070 const char *input, Py_ssize_t length,
4071 Py_ssize_t startpos, Py_ssize_t endpos,
4072 const char *reason)
4073{
4074 if (*exceptionObject == NULL) {
4075 *exceptionObject = PyUnicodeDecodeError_Create(
4076 encoding, input, length, startpos, endpos, reason);
4077 }
4078 else {
4079 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4080 goto onError;
4081 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4082 goto onError;
4083 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4084 goto onError;
4085 }
4086 return;
4087
4088onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004089 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004090}
4091
#ifdef MS_WINDOWS
/* Error handling callback helper (wchar_t output buffer variant):
   build the arguments, call the callback and check its result; if no
   exception occurred, copy the replacement to the output and adjust the
   various state variables.

   errors, errorHandler: error handler name and cached handler object; the
       handler is looked up with PyCodec_LookupError() on first use.
   encoding, reason: used to create or update *exceptionObject.
   input, inend: bounds of the input bytes; refreshed from the exception
       object after the call, since the callback may have replaced it.
   startinpos, endinpos: position of the undecodable input; *endinpos is
       set to the position returned by the callback.
   inptr: set to the input position where decoding must resume.
   output, outpos: wchar_t-kind str under construction and the current
       write position in it; *output is resized when needed and *outpos is
       advanced past the replacement.

   Return 0 on success, -1 on error.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The first three characters ("Un;") are the PyArg_ParseTuple() format;
       the text after the ';' doubles as the TypeError message. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is enough and cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen characters.  wcsncpy() must not be used here:
       it stops at the first null character and zero-fills the rest, which
       would truncate a replacement string containing U+0000. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004204
4205static int
4206unicode_decode_call_errorhandler_writer(
4207 const char *errors, PyObject **errorHandler,
4208 const char *encoding, const char *reason,
4209 const char **input, const char **inend, Py_ssize_t *startinpos,
4210 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4211 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4212{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004213 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004214
4215 PyObject *restuple = NULL;
4216 PyObject *repunicode = NULL;
4217 Py_ssize_t insize;
4218 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004219 Py_ssize_t replen;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004220 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004221 PyObject *inputobj = NULL;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004222 int need_to_grow = 0;
4223 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004224
4225 if (*errorHandler == NULL) {
4226 *errorHandler = PyCodec_LookupError(errors);
4227 if (*errorHandler == NULL)
4228 goto onError;
4229 }
4230
4231 make_decode_exception(exceptionObject,
4232 encoding,
4233 *input, *inend - *input,
4234 *startinpos, *endinpos,
4235 reason);
4236 if (*exceptionObject == NULL)
4237 goto onError;
4238
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004239 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004240 if (restuple == NULL)
4241 goto onError;
4242 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004243 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004244 goto onError;
4245 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004246 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004247 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004248
4249 /* Copy back the bytes variables, which might have been modified by the
4250 callback */
4251 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4252 if (!inputobj)
4253 goto onError;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004254 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004255 *input = PyBytes_AS_STRING(inputobj);
4256 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004257 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004258 /* we can DECREF safely, as the exception has another reference,
4259 so the object won't go away. */
4260 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004263 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004264 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004265 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004266 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004267 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268
Victor Stinner170ca6f2013-04-18 00:25:28 +02004269 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004270 if (replen > 1) {
4271 writer->min_length += replen - 1;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004272 need_to_grow = 1;
4273 }
4274 new_inptr = *input + newpos;
4275 if (*inend - new_inptr > remain) {
4276 /* We don't know the decoding algorithm here so we make the worst
4277 assumption that one byte decodes to one unicode character.
4278 If unfortunately one byte could decode to more unicode characters,
4279 the decoder may write out-of-bound then. Is it possible for the
4280 algorithms using this function? */
4281 writer->min_length += *inend - new_inptr - remain;
4282 need_to_grow = 1;
4283 }
4284 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004285 writer->overallocate = 1;
Miss Islington (bot)09819ef2018-02-12 23:15:57 -08004286 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004287 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4288 goto onError;
4289 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004290 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004291 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293 *endinpos = newpos;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004294 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004295
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004297 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004298 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004299
Benjamin Peterson29060642009-01-31 22:14:21 +00004300 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004302 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303}
4304
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004305/* --- UTF-7 Codec -------------------------------------------------------- */
4306
Antoine Pitrou244651a2009-05-04 18:56:13 +00004307/* See RFC2152 for details. We encode conservatively and decode liberally. */
4308
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  Each macro may evaluate its argument more than once. */

/* Non-zero iff c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ('0' <= (c) && (c) <= '9') || \
     ('a' <= (c) && (c) <= 'z') || \
     ('A' <= (c) && (c) <= 'Z'))

/* Six-bit value of the base-64 character c.  The caller must already know
   that IS_BASE64(c) holds; any other input falls through to 63. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 : \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 : \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character that encodes the low six bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: non-zero iff a byte met outside a shift sequence stands
 * for itself.  Decoding is permissive: every 7-bit value qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4339
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the character code (0..127) and is only ever
 * read, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - character code; anything outside 1..127 is never direct
 *   directO  - non-zero to pass "Set O" characters through unencoded
 *   directWS - non-zero to pass whitespace through unencoded
 *
 * Every parameter is parenthesized in the expansion so that a compound
 * flag expression (e.g. `a || b`) keeps its meaning next to `&&`.
 * c may be evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004385
Alexander Belopolsky40018472011-02-26 01:02:56 +00004386PyObject *
4387PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004388 Py_ssize_t size,
4389 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004390{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004391 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4392}
4393
Antoine Pitrou244651a2009-05-04 18:56:13 +00004394/* The decoder. The only state we preserve is our read position,
4395 * i.e. how many characters we have consumed. So if we end in the
4396 * middle of a shift sequence we have to back off the read position
4397 * and the output to the beginning of the sequence, otherwise we lose
4398 * all the shift state (seen bits, number of bits seen, high
4399 * surrogate). */
4400
/* Decode `size` bytes starting at `s` as UTF-7.
 *
 * errors:   error-handler name, forwarded to
 *           unicode_decode_call_errorhandler_writer() on malformed input.
 * consumed: if NULL, the input is treated as complete: ending inside a
 *           shift sequence that still holds a pending surrogate or
 *           left-over bits is reported as "unterminated shift sequence".
 *           If non-NULL, no such error is raised; *consumed receives the
 *           number of input bytes used, and when the input ends inside a
 *           shift sequence both *consumed and the output are backed off to
 *           the point where that sequence began.
 *
 * Returns the result of _PyUnicodeWriter_Finish() (or, in one back-off
 * case, of PyUnicode_FromKindAndData()), or NULL on failure.  An empty
 * input yields the empty string and sets *consumed to 0.
 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004401PyObject *
4402PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004403 Py_ssize_t size,
4404 const char *errors,
4405 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004406{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004408 Py_ssize_t startinpos;
4409 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004411 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412 const char *errmsg = "";
4413 int inShift = 0;
    /* writer.pos at the moment the current shift sequence was opened */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004414 Py_ssize_t shiftOutStart;
    /* number of not-yet-emitted bits held in base64buffer */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 unsigned int base64bits = 0;
4416 unsigned long base64buffer = 0;
    /* pending high surrogate waiting for its low half; 0 if none */
Victor Stinner24729f32011-11-10 20:31:37 +01004417 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 PyObject *errorHandler = NULL;
4419 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004420
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004421 if (size == 0) {
4422 if (consumed)
4423 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004424 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004425 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004426
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004427 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004428 _PyUnicodeWriter_Init(&writer);
4429 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004430
4431 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004432 e = s + size;
4433
4434 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004435 Py_UCS4 ch;
        /* Also entered by goto from the end-of-input error handling below,
           when the handler left s inside the buffer (s < e). */
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004437 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438
Antoine Pitrou244651a2009-05-04 18:56:13 +00004439 if (inShift) { /* in a base-64 section */
4440 if (IS_BASE64(ch)) { /* consume a base-64 character */
4441 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4442 base64bits += 6;
4443 s++;
4444 if (base64bits >= 16) {
4445 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004446 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 base64bits -= 16;
4448 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004449 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004450 if (surrogate) {
4451 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004452 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4453 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004454 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004455 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004457 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458 }
4459 else {
                            /* no low surrogate followed: emit the pending
                               high surrogate unpaired, then handle outCh
                               below */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004460 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004461 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004463 }
4464 }
Victor Stinner551ac952011-11-29 22:58:13 +01004465 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 /* first surrogate */
4467 surrogate = outCh;
4468 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004469 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004470 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004471 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004472 }
4473 }
4474 }
4475 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004476 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004477 if (base64bits > 0) { /* left-over bits */
4478 if (base64bits >= 6) {
4479 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004480 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004481 errmsg = "partial character in shift sequence";
4482 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004483 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004484 else {
4485 /* Some bits remain; they should be zero */
4486 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004487 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004488 errmsg = "non-zero padding bits in shift sequence";
4489 goto utf7Error;
4490 }
4491 }
4492 }
                /* A high surrogate still pending here is written out
                   unpaired only if the terminating byte decodes directly;
                   either way the pending state is cleared. */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004493 if (surrogate && DECODE_DIRECT(ch)) {
4494 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4495 goto onError;
4496 }
4497 surrogate = 0;
4498 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 /* '-' is absorbed; other terminating
4500 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004501 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004503 }
4504 }
4505 else if ( ch == '+' ) {
            /* remember where the sequence starts, for error reporting and
               for the back-off at the end of the function */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 s++; /* consume '+' */
4508 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004509 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004510 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004511 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004512 }
4513 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004514 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004515 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004516 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004517 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004518 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519 }
4520 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004522 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004523 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004524 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004525 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004526 else {
4527 startinpos = s-starts;
4528 s++;
4529 errmsg = "unexpected special character";
4530 goto utf7Error;
4531 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 continue;
        /* Error path, reached only by goto.  The handler is given the
           addresses of starts, e and s, so it can replace the input buffer
           and set the position at which the loop resumes. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004535 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004536 errors, &errorHandler,
4537 "utf7", errmsg,
4538 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004539 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541 }
4542
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 /* end of string */
4544
4545 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4546 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004547 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004548 if (surrogate ||
4549 (base64bits >= 6) ||
4550 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004552 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 errors, &errorHandler,
4554 "utf7", "unterminated shift sequence",
4555 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004556 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004557 goto onError;
            /* the handler may have moved s back before e: keep decoding */
4558 if (s < e)
4559 goto restart;
4560 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004562
4563 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004564 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565 if (inShift) {
            /* report only the bytes before the '+' that opened the
               unfinished shift sequence */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004566 *consumed = startinpos;
            /* Characters from the unfinished sequence were already written
               and the writer holds non-ASCII data: build the result from
               the first shiftOutStart characters directly instead of
               finishing the writer.
               NOTE(review): presumably because the discarded characters may
               have raised writer.maxchar - confirm. */
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004567 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004568 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004569 writer.kind, writer.data, shiftOutStart);
4570 Py_XDECREF(errorHandler);
4571 Py_XDECREF(exc);
4572 _PyUnicodeWriter_Dealloc(&writer);
4573 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004574 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004575 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 }
4577 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004578 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004580 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004581
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582 Py_XDECREF(errorHandler);
4583 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004584 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004585
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 Py_XDECREF(errorHandler);
4588 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004589 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004590 return NULL;
4591}
4592
4593
Alexander Belopolsky40018472011-02-26 01:02:56 +00004594PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004595_PyUnicode_EncodeUTF7(PyObject *str,
4596 int base64SetO,
4597 int base64WhiteSpace,
4598 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004599{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004600 int kind;
4601 void *data;
4602 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004603 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004604 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004605 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 unsigned int base64bits = 0;
4607 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004608 char * out;
4609 char * start;
4610
Benjamin Petersonbac79492012-01-14 13:34:47 -05004611 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004612 return NULL;
4613 kind = PyUnicode_KIND(str);
4614 data = PyUnicode_DATA(str);
4615 len = PyUnicode_GET_LENGTH(str);
4616
4617 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004619
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004620 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004621 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004622 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004623 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004624 if (v == NULL)
4625 return NULL;
4626
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004627 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004628 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004629 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004630
Antoine Pitrou244651a2009-05-04 18:56:13 +00004631 if (inShift) {
4632 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4633 /* shifting out */
4634 if (base64bits) { /* output remaining bits */
4635 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4636 base64buffer = 0;
4637 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638 }
4639 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004640 /* Characters not in the BASE64 set implicitly unshift the sequence
4641 so no '-' is required, except if the character is itself a '-' */
4642 if (IS_BASE64(ch) || ch == '-') {
4643 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 *out++ = (char) ch;
4646 }
4647 else {
4648 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004649 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004650 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004651 else { /* not in a shift sequence */
4652 if (ch == '+') {
4653 *out++ = '+';
4654 *out++ = '-';
4655 }
4656 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4657 *out++ = (char) ch;
4658 }
4659 else {
4660 *out++ = '+';
4661 inShift = 1;
4662 goto encode_char;
4663 }
4664 }
4665 continue;
4666encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004668 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004669
Antoine Pitrou244651a2009-05-04 18:56:13 +00004670 /* code first surrogate */
4671 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004672 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 while (base64bits >= 6) {
4674 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4675 base64bits -= 6;
4676 }
4677 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004678 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004679 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004680 base64bits += 16;
4681 base64buffer = (base64buffer << 16) | ch;
4682 while (base64bits >= 6) {
4683 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4684 base64bits -= 6;
4685 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004686 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 if (base64bits)
4688 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4689 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004690 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004691 if (_PyBytes_Resize(&v, out - start) < 0)
4692 return NULL;
4693 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004694}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004695PyObject *
4696PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4697 Py_ssize_t size,
4698 int base64SetO,
4699 int base64WhiteSpace,
4700 const char *errors)
4701{
4702 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004703 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004704 if (tmp == NULL)
4705 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004706 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004707 base64WhiteSpace, errors);
4708 Py_DECREF(tmp);
4709 return result;
4710}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711
Antoine Pitrou244651a2009-05-04 18:56:13 +00004712#undef IS_BASE64
4713#undef FROM_BASE64
4714#undef TO_BASE64
4715#undef DECODE_DIRECT
4716#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004717
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718/* --- UTF-8 Codec -------------------------------------------------------- */
4719
Alexander Belopolsky40018472011-02-26 01:02:56 +00004720PyObject *
4721PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004722 Py_ssize_t size,
4723 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724{
Walter Dörwald69652032004-09-07 20:24:22 +00004725 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4726}
4727
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004728#include "stringlib/asciilib.h"
4729#include "stringlib/codecs.h"
4730#include "stringlib/undef.h"
4731
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004732#include "stringlib/ucs1lib.h"
4733#include "stringlib/codecs.h"
4734#include "stringlib/undef.h"
4735
4736#include "stringlib/ucs2lib.h"
4737#include "stringlib/codecs.h"
4738#include "stringlib/undef.h"
4739
4740#include "stringlib/ucs4lib.h"
4741#include "stringlib/codecs.h"
4742#include "stringlib/undef.h"
4743
Antoine Pitrouab868312009-01-10 15:40:25 +00004744/* Mask to quickly check whether a C 'long' contains a
4745 non-ASCII, UTF8-encoded char. */
4746#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004747# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004748#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004749# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004750#else
4751# error C 'long' size should be either 4 or 8!
4752#endif
4753
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004754static Py_ssize_t
4755ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004756{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004757 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004758 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004759
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004760 /*
4761 * Issue #17237: m68k is a bit different from most architectures in
4762 * that objects do not use "natural alignment" - for example, int and
4763 * long are only aligned at 2-byte boundaries. Therefore the assert()
4764 * won't work; also, tests have shown that skipping the "optimised
4765 * version" will even speed up m68k.
4766 */
4767#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004768#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004769 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4770 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004771 /* Fast path, see in STRINGLIB(utf8_decode) for
4772 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004773 /* Help allocation */
4774 const char *_p = p;
4775 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004776 while (_p < aligned_end) {
4777 unsigned long value = *(const unsigned long *) _p;
4778 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004780 *((unsigned long *)q) = value;
4781 _p += SIZEOF_LONG;
4782 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004783 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 p = _p;
4785 while (p < end) {
4786 if ((unsigned char)*p & 0x80)
4787 break;
4788 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004790 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004792#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004793#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 while (p < end) {
4795 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4796 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004797 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004798 /* Help allocation */
4799 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004800 while (_p < aligned_end) {
4801 unsigned long value = *(unsigned long *) _p;
4802 if (value & ASCII_CHAR_MASK)
4803 break;
4804 _p += SIZEOF_LONG;
4805 }
4806 p = _p;
4807 if (_p == end)
4808 break;
4809 }
4810 if ((unsigned char)*p & 0x80)
4811 break;
4812 ++p;
4813 }
4814 memcpy(dest, start, p - start);
4815 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816}
Antoine Pitrouab868312009-01-10 15:40:25 +00004817
Victor Stinner785938e2011-12-11 20:09:03 +01004818PyObject *
4819PyUnicode_DecodeUTF8Stateful(const char *s,
4820 Py_ssize_t size,
4821 const char *errors,
4822 Py_ssize_t *consumed)
4823{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004824 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004825 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004826 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004827
4828 Py_ssize_t startinpos;
4829 Py_ssize_t endinpos;
4830 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004831 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004832 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004833 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004834
4835 if (size == 0) {
4836 if (consumed)
4837 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004838 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004839 }
4840
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004841 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4842 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004843 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004844 *consumed = 1;
4845 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004846 }
4847
Victor Stinner8f674cc2013-04-17 23:02:17 +02004848 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004849 writer.min_length = size;
4850 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004851 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004852
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004853 writer.pos = ascii_decode(s, end, writer.data);
4854 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004855 while (s < end) {
4856 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004857 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004858
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004859 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004860 if (PyUnicode_IS_ASCII(writer.buffer))
4861 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004862 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004863 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004864 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004865 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004866 } else {
4867 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004868 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004869 }
4870
4871 switch (ch) {
4872 case 0:
4873 if (s == end || consumed)
4874 goto End;
4875 errmsg = "unexpected end of data";
4876 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004877 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004878 break;
4879 case 1:
4880 errmsg = "invalid start byte";
4881 startinpos = s - starts;
4882 endinpos = startinpos + 1;
4883 break;
4884 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004885 case 3:
4886 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 errmsg = "invalid continuation byte";
4888 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004889 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004890 break;
4891 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004892 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004893 goto onError;
4894 continue;
4895 }
4896
Victor Stinner1d65d912015-10-05 13:43:50 +02004897 if (error_handler == _Py_ERROR_UNKNOWN)
4898 error_handler = get_error_handler(errors);
4899
4900 switch (error_handler) {
4901 case _Py_ERROR_IGNORE:
4902 s += (endinpos - startinpos);
4903 break;
4904
4905 case _Py_ERROR_REPLACE:
4906 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4907 goto onError;
4908 s += (endinpos - startinpos);
4909 break;
4910
4911 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004912 {
4913 Py_ssize_t i;
4914
Victor Stinner1d65d912015-10-05 13:43:50 +02004915 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4916 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004917 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004918 ch = (Py_UCS4)(unsigned char)(starts[i]);
4919 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4920 ch + 0xdc00);
4921 writer.pos++;
4922 }
4923 s += (endinpos - startinpos);
4924 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004925 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004926
4927 default:
4928 if (unicode_decode_call_errorhandler_writer(
4929 errors, &error_handler_obj,
4930 "utf-8", errmsg,
4931 &starts, &end, &startinpos, &endinpos, &exc, &s,
4932 &writer))
4933 goto onError;
4934 }
Victor Stinner785938e2011-12-11 20:09:03 +01004935 }
4936
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938 if (consumed)
4939 *consumed = s - starts;
4940
Victor Stinner1d65d912015-10-05 13:43:50 +02004941 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004942 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004943 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004944
4945onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004946 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004948 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004949 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004950}
4951
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004952
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004953/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4954 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004955
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004956 On success, write a pointer to a newly allocated wide character string into
4957 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4958 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004959
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004960 On memory allocation failure, return -1.
4961
4962 On decoding error (if surrogateescape is zero), return -2. If wlen is
4963 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4964 is not NULL, write the decoding error message into *reason. */
4965int
4966_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
4967 const char **reason, int surrogateescape)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004968{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004969 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004970 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004971 wchar_t *unicode;
4972 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004973
4974 /* Note: size will always be longer than the resulting Unicode
4975 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004976 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004977 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004978 }
4979
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004980 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004981 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004982 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004983 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004984
4985 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004986 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004988 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004990#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004992#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004994#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995 if (ch > 0xFF) {
4996#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07004997 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02004999 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005000 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5002 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5003#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005004 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 else {
5006 if (!ch && s == e)
5007 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005008 if (!surrogateescape) {
5009 PyMem_RawFree(unicode );
5010 if (reason != NULL) {
5011 switch (ch) {
5012 case 0:
5013 *reason = "unexpected end of data";
5014 break;
5015 case 1:
5016 *reason = "invalid start byte";
5017 break;
5018 /* 2, 3, 4 */
5019 default:
5020 *reason = "invalid continuation byte";
5021 break;
5022 }
5023 }
5024 if (wlen != NULL) {
5025 *wlen = s - orig_s;
5026 }
5027 return -2;
5028 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005029 /* surrogateescape */
5030 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5031 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005032 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005033 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005034 if (wlen) {
5035 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005036 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005037 *wstr = unicode;
5038 return 0;
5039}
5040
5041wchar_t*
5042_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5043{
5044 wchar_t *wstr;
5045 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5046 if (res != 0) {
5047 return NULL;
5048 }
5049 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005050}
5051
Antoine Pitrouab868312009-01-10 15:40:25 +00005052
Victor Stinnere47e6982017-12-21 15:45:16 +01005053/* UTF-8 encoder using the surrogateescape error handler .
5054
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005055 On success, return 0 and write the newly allocated character string (use
5056 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005057
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005058 On encoding failure, return -2 and write the position of the invalid
5059 surrogate character into *error_pos (if error_pos is set) and the decoding
5060 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005061
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005062 On memory allocation failure, return -1. */
5063int
5064_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5065 const char **reason, int raw_malloc, int surrogateescape)
Victor Stinnere47e6982017-12-21 15:45:16 +01005066{
5067 const Py_ssize_t max_char_size = 4;
5068 Py_ssize_t len = wcslen(text);
5069
5070 assert(len >= 0);
5071
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005072 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5073 return -1;
5074 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005075 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005076 if (raw_malloc) {
5077 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005078 }
5079 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005080 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005081 }
5082 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005083 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005084 }
5085
5086 char *p = bytes;
5087 Py_ssize_t i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005088 for (i = 0; i < len; i++) {
5089 Py_UCS4 ch = text[i];
Victor Stinnere47e6982017-12-21 15:45:16 +01005090
5091 if (ch < 0x80) {
5092 /* Encode ASCII */
5093 *p++ = (char) ch;
5094
5095 }
5096 else if (ch < 0x0800) {
5097 /* Encode Latin-1 */
5098 *p++ = (char)(0xc0 | (ch >> 6));
5099 *p++ = (char)(0x80 | (ch & 0x3f));
5100 }
5101 else if (Py_UNICODE_IS_SURROGATE(ch)) {
5102 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005103 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005104 if (error_pos != NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005105 *error_pos = (size_t)i;
Victor Stinnere47e6982017-12-21 15:45:16 +01005106 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005107 if (reason != NULL) {
5108 *reason = "encoding error";
5109 }
5110 if (raw_malloc) {
5111 PyMem_RawFree(bytes);
5112 }
5113 else {
5114 PyMem_Free(bytes);
5115 }
5116 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005117 }
5118 *p++ = (char)(ch & 0xff);
5119 }
5120 else if (ch < 0x10000) {
5121 *p++ = (char)(0xe0 | (ch >> 12));
5122 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5123 *p++ = (char)(0x80 | (ch & 0x3f));
5124 }
5125 else { /* ch >= 0x10000 */
5126 assert(ch <= MAX_UNICODE);
5127 /* Encode UCS4 Unicode ordinals */
5128 *p++ = (char)(0xf0 | (ch >> 18));
5129 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5130 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5131 *p++ = (char)(0x80 | (ch & 0x3f));
5132 }
5133 }
5134 *p++ = '\0';
5135
5136 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005137 char *bytes2;
5138 if (raw_malloc) {
5139 bytes2 = PyMem_RawRealloc(bytes, final_size);
5140 }
5141 else {
5142 bytes2 = PyMem_Realloc(bytes, final_size);
5143 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005144 if (bytes2 == NULL) {
5145 if (error_pos != NULL) {
5146 *error_pos = (size_t)-1;
5147 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005148 if (raw_malloc) {
5149 PyMem_RawFree(bytes);
5150 }
5151 else {
5152 PyMem_Free(bytes);
5153 }
5154 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005155 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005156 *str = bytes2;
5157 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005158}
5159
5160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005161/* Primary internal function which creates utf8 encoded bytes objects.
5162
5163 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005164 and allocate exactly as much space needed at the end. Else allocate the
5165 maximum possible needed (4 result bytes per Unicode character), and return
5166 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005167*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005168PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005169_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
Victor Stinner6099a032011-12-18 14:22:26 +01005171 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005172 void *data;
5173 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005175 if (!PyUnicode_Check(unicode)) {
5176 PyErr_BadArgument();
5177 return NULL;
5178 }
5179
5180 if (PyUnicode_READY(unicode) == -1)
5181 return NULL;
5182
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005183 if (PyUnicode_UTF8(unicode))
5184 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5185 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005186
5187 kind = PyUnicode_KIND(unicode);
5188 data = PyUnicode_DATA(unicode);
5189 size = PyUnicode_GET_LENGTH(unicode);
5190
Benjamin Petersonead6b532011-12-20 17:23:42 -06005191 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005192 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005193 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005194 case PyUnicode_1BYTE_KIND:
5195 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5196 assert(!PyUnicode_IS_ASCII(unicode));
5197 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5198 case PyUnicode_2BYTE_KIND:
5199 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5200 case PyUnicode_4BYTE_KIND:
5201 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203}
5204
Alexander Belopolsky40018472011-02-26 01:02:56 +00005205PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005206PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5207 Py_ssize_t size,
5208 const char *errors)
5209{
5210 PyObject *v, *unicode;
5211
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005212 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005213 if (unicode == NULL)
5214 return NULL;
5215 v = _PyUnicode_AsUTF8String(unicode, errors);
5216 Py_DECREF(unicode);
5217 return v;
5218}
5219
5220PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005221PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005223 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224}
5225
Walter Dörwald41980ca2007-08-16 21:55:45 +00005226/* --- UTF-32 Codec ------------------------------------------------------- */
5227
5228PyObject *
5229PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 Py_ssize_t size,
5231 const char *errors,
5232 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233{
5234 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5235}
5236
5237PyObject *
5238PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 Py_ssize_t size,
5240 const char *errors,
5241 int *byteorder,
5242 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005243{
5244 const char *starts = s;
5245 Py_ssize_t startinpos;
5246 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005247 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005248 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005249 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005250 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005251 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005252 PyObject *errorHandler = NULL;
5253 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005254
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255 q = (unsigned char *)s;
5256 e = q + size;
5257
5258 if (byteorder)
5259 bo = *byteorder;
5260
5261 /* Check for BOM marks (U+FEFF) in the input and adjust current
5262 byte order setting accordingly. In native mode, the leading BOM
5263 mark is skipped, in all other modes, it is copied to the output
5264 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005265 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005266 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005267 if (bom == 0x0000FEFF) {
5268 bo = -1;
5269 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005271 else if (bom == 0xFFFE0000) {
5272 bo = 1;
5273 q += 4;
5274 }
5275 if (byteorder)
5276 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005277 }
5278
Victor Stinnere64322e2012-10-30 23:12:47 +01005279 if (q == e) {
5280 if (consumed)
5281 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005282 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005283 }
5284
Victor Stinnere64322e2012-10-30 23:12:47 +01005285#ifdef WORDS_BIGENDIAN
5286 le = bo < 0;
5287#else
5288 le = bo <= 0;
5289#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005290 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005291
Victor Stinner8f674cc2013-04-17 23:02:17 +02005292 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005293 writer.min_length = (e - q + 3) / 4;
5294 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005295 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005296
Victor Stinnere64322e2012-10-30 23:12:47 +01005297 while (1) {
5298 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005300
Victor Stinnere64322e2012-10-30 23:12:47 +01005301 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005302 enum PyUnicode_Kind kind = writer.kind;
5303 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005304 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005305 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005306 if (le) {
5307 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005308 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005309 if (ch > maxch)
5310 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005311 if (kind != PyUnicode_1BYTE_KIND &&
5312 Py_UNICODE_IS_SURROGATE(ch))
5313 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005314 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005315 q += 4;
5316 } while (q <= last);
5317 }
5318 else {
5319 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005320 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005321 if (ch > maxch)
5322 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005323 if (kind != PyUnicode_1BYTE_KIND &&
5324 Py_UNICODE_IS_SURROGATE(ch))
5325 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005326 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005327 q += 4;
5328 } while (q <= last);
5329 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005330 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005331 }
5332
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005333 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005334 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005335 startinpos = ((const char *)q) - starts;
5336 endinpos = startinpos + 4;
5337 }
5338 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005339 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005341 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005343 startinpos = ((const char *)q) - starts;
5344 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005346 else {
5347 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005348 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005349 goto onError;
5350 q += 4;
5351 continue;
5352 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005353 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005354 startinpos = ((const char *)q) - starts;
5355 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005357
5358 /* The remaining input chars are ignored if the callback
5359 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005360 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005362 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005366 }
5367
Walter Dörwald41980ca2007-08-16 21:55:45 +00005368 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371 Py_XDECREF(errorHandler);
5372 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005374
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005376 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005377 Py_XDECREF(errorHandler);
5378 Py_XDECREF(exc);
5379 return NULL;
5380}
5381
5382PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005383_PyUnicode_EncodeUTF32(PyObject *str,
5384 const char *errors,
5385 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005386{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005387 enum PyUnicode_Kind kind;
5388 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005390 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005391 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005392#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005393 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005394#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005395 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005396#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005397 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005398 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005399 PyObject *errorHandler = NULL;
5400 PyObject *exc = NULL;
5401 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005402
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005403 if (!PyUnicode_Check(str)) {
5404 PyErr_BadArgument();
5405 return NULL;
5406 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005407 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005408 return NULL;
5409 kind = PyUnicode_KIND(str);
5410 data = PyUnicode_DATA(str);
5411 len = PyUnicode_GET_LENGTH(str);
5412
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005413 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005414 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005415 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005416 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005417 if (v == NULL)
5418 return NULL;
5419
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005420 /* output buffer is 4-bytes aligned */
5421 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005422 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005423 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005424 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005425 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005426 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005427
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005428 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005429 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005431 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005432 else
5433 encoding = "utf-32";
5434
5435 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005436 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5437 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005438 }
5439
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 pos = 0;
5441 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005442 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005443
5444 if (kind == PyUnicode_2BYTE_KIND) {
5445 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5446 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005447 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005448 else {
5449 assert(kind == PyUnicode_4BYTE_KIND);
5450 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5451 &out, native_ordering);
5452 }
5453 if (pos == len)
5454 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005455
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005456 rep = unicode_encode_call_errorhandler(
5457 errors, &errorHandler,
5458 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005459 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005460 if (!rep)
5461 goto error;
5462
5463 if (PyBytes_Check(rep)) {
5464 repsize = PyBytes_GET_SIZE(rep);
5465 if (repsize & 3) {
5466 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005467 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005468 "surrogates not allowed");
5469 goto error;
5470 }
5471 moreunits = repsize / 4;
5472 }
5473 else {
5474 assert(PyUnicode_Check(rep));
5475 if (PyUnicode_READY(rep) < 0)
5476 goto error;
5477 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5478 if (!PyUnicode_IS_ASCII(rep)) {
5479 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005480 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005481 "surrogates not allowed");
5482 goto error;
5483 }
5484 }
5485
5486 /* four bytes are reserved for each surrogate */
5487 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005488 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005489 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005490 /* integer overflow */
5491 PyErr_NoMemory();
5492 goto error;
5493 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005494 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005495 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005496 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 }
5498
5499 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005500 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005501 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005502 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005504 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5505 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005506 }
5507
5508 Py_CLEAR(rep);
5509 }
5510
5511 /* Cut back to size actually needed. This is necessary for, for example,
5512 encoding of a string containing isolated surrogates and the 'ignore'
5513 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005514 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005515 if (nsize != PyBytes_GET_SIZE(v))
5516 _PyBytes_Resize(&v, nsize);
5517 Py_XDECREF(errorHandler);
5518 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005519 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005520 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005521 error:
5522 Py_XDECREF(rep);
5523 Py_XDECREF(errorHandler);
5524 Py_XDECREF(exc);
5525 Py_XDECREF(v);
5526 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005527}
5528
Alexander Belopolsky40018472011-02-26 01:02:56 +00005529PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005530PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5531 Py_ssize_t size,
5532 const char *errors,
5533 int byteorder)
5534{
5535 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005536 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005537 if (tmp == NULL)
5538 return NULL;
5539 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5540 Py_DECREF(tmp);
5541 return result;
5542}
5543
5544PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005545PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005546{
Victor Stinnerb960b342011-11-20 19:12:52 +01005547 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005548}
5549
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550/* --- UTF-16 Codec ------------------------------------------------------- */
5551
Tim Peters772747b2001-08-09 22:21:55 +00005552PyObject *
5553PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 Py_ssize_t size,
5555 const char *errors,
5556 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557{
Walter Dörwald69652032004-09-07 20:24:22 +00005558 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5559}
5560
5561PyObject *
5562PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 Py_ssize_t size,
5564 const char *errors,
5565 int *byteorder,
5566 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005567{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005568 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005569 Py_ssize_t startinpos;
5570 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005571 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005572 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005573 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005575 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576 PyObject *errorHandler = NULL;
5577 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005578 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
Tim Peters772747b2001-08-09 22:21:55 +00005580 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005581 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
5583 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005584 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005586 /* Check for BOM marks (U+FEFF) in the input and adjust current
5587 byte order setting accordingly. In native mode, the leading BOM
5588 mark is skipped, in all other modes, it is copied to the output
5589 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005590 if (bo == 0 && size >= 2) {
5591 const Py_UCS4 bom = (q[1] << 8) | q[0];
5592 if (bom == 0xFEFF) {
5593 q += 2;
5594 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005596 else if (bom == 0xFFFE) {
5597 q += 2;
5598 bo = 1;
5599 }
5600 if (byteorder)
5601 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603
Antoine Pitrou63065d72012-05-15 23:48:04 +02005604 if (q == e) {
5605 if (consumed)
5606 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005607 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005608 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005609
Christian Heimes743e0cd2012-10-17 23:52:17 +02005610#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005612 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005613#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005614 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005615 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005616#endif
Tim Peters772747b2001-08-09 22:21:55 +00005617
Antoine Pitrou63065d72012-05-15 23:48:04 +02005618 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang86fdad02018-01-31 20:48:05 +08005619 character count normally. Error handler will take care of
5620 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005621 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005622 writer.min_length = (e - q + 1) / 2;
5623 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005624 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005625
Antoine Pitrou63065d72012-05-15 23:48:04 +02005626 while (1) {
5627 Py_UCS4 ch = 0;
5628 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005630 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005631 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005632 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 native_ordering);
5635 else
5636 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005637 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005638 native_ordering);
5639 } else if (kind == PyUnicode_2BYTE_KIND) {
5640 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005641 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005642 native_ordering);
5643 } else {
5644 assert(kind == PyUnicode_4BYTE_KIND);
5645 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005646 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005647 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005648 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005649 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650
Antoine Pitrou63065d72012-05-15 23:48:04 +02005651 switch (ch)
5652 {
5653 case 0:
5654 /* remaining byte at the end? (size should be even) */
5655 if (q == e || consumed)
5656 goto End;
5657 errmsg = "truncated data";
5658 startinpos = ((const char *)q) - starts;
5659 endinpos = ((const char *)e) - starts;
5660 break;
5661 /* The remaining input chars are ignored if the callback
5662 chooses to skip the input */
5663 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005664 q -= 2;
5665 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005666 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005667 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005668 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005669 endinpos = ((const char *)e) - starts;
5670 break;
5671 case 2:
5672 errmsg = "illegal encoding";
5673 startinpos = ((const char *)q) - 2 - starts;
5674 endinpos = startinpos + 2;
5675 break;
5676 case 3:
5677 errmsg = "illegal UTF-16 surrogate";
5678 startinpos = ((const char *)q) - 4 - starts;
5679 endinpos = startinpos + 2;
5680 break;
5681 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005682 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005683 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 continue;
5685 }
5686
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005687 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005688 errors,
5689 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005690 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005691 &starts,
5692 (const char **)&e,
5693 &startinpos,
5694 &endinpos,
5695 &exc,
5696 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005697 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 }
5700
Antoine Pitrou63065d72012-05-15 23:48:04 +02005701End:
Walter Dörwald69652032004-09-07 20:24:22 +00005702 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705 Py_XDECREF(errorHandler);
5706 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005710 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711 Py_XDECREF(errorHandler);
5712 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 return NULL;
5714}
5715
Tim Peters772747b2001-08-09 22:21:55 +00005716PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005717_PyUnicode_EncodeUTF16(PyObject *str,
5718 const char *errors,
5719 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005721 enum PyUnicode_Kind kind;
5722 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005723 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005724 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005725 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005726 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005727#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005728 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005729#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005730 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005731#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005732 const char *encoding;
5733 Py_ssize_t nsize, pos;
5734 PyObject *errorHandler = NULL;
5735 PyObject *exc = NULL;
5736 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005737
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005738 if (!PyUnicode_Check(str)) {
5739 PyErr_BadArgument();
5740 return NULL;
5741 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005742 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005743 return NULL;
5744 kind = PyUnicode_KIND(str);
5745 data = PyUnicode_DATA(str);
5746 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005747
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005748 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005749 if (kind == PyUnicode_4BYTE_KIND) {
5750 const Py_UCS4 *in = (const Py_UCS4 *)data;
5751 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005752 while (in < end) {
5753 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005754 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005755 }
5756 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005757 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005758 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005760 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005761 nsize = len + pairs + (byteorder == 0);
5762 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005763 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005767 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005768 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005769 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005770 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005771 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005772 }
5773 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005774 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005775 }
Tim Peters772747b2001-08-09 22:21:55 +00005776
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005777 if (kind == PyUnicode_1BYTE_KIND) {
5778 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5779 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005780 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005781
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005782 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005783 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005784 }
5785 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005786 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005787 }
5788 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005789 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005790 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005791
5792 pos = 0;
5793 while (pos < len) {
5794 Py_ssize_t repsize, moreunits;
5795
5796 if (kind == PyUnicode_2BYTE_KIND) {
5797 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5798 &out, native_ordering);
5799 }
5800 else {
5801 assert(kind == PyUnicode_4BYTE_KIND);
5802 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5803 &out, native_ordering);
5804 }
5805 if (pos == len)
5806 break;
5807
5808 rep = unicode_encode_call_errorhandler(
5809 errors, &errorHandler,
5810 encoding, "surrogates not allowed",
5811 str, &exc, pos, pos + 1, &pos);
5812 if (!rep)
5813 goto error;
5814
5815 if (PyBytes_Check(rep)) {
5816 repsize = PyBytes_GET_SIZE(rep);
5817 if (repsize & 1) {
5818 raise_encode_exception(&exc, encoding,
5819 str, pos - 1, pos,
5820 "surrogates not allowed");
5821 goto error;
5822 }
5823 moreunits = repsize / 2;
5824 }
5825 else {
5826 assert(PyUnicode_Check(rep));
5827 if (PyUnicode_READY(rep) < 0)
5828 goto error;
5829 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5830 if (!PyUnicode_IS_ASCII(rep)) {
5831 raise_encode_exception(&exc, encoding,
5832 str, pos - 1, pos,
5833 "surrogates not allowed");
5834 goto error;
5835 }
5836 }
5837
5838 /* two bytes are reserved for each surrogate */
5839 if (moreunits > 1) {
5840 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005841 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005842 /* integer overflow */
5843 PyErr_NoMemory();
5844 goto error;
5845 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005846 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005847 goto error;
5848 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5849 }
5850
5851 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005852 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005853 out += moreunits;
5854 } else /* rep is unicode */ {
5855 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5856 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5857 &out, native_ordering);
5858 }
5859
5860 Py_CLEAR(rep);
5861 }
5862
5863 /* Cut back to size actually needed. This is necessary for, for example,
5864 encoding of a string containing isolated surrogates and the 'ignore' handler
5865 is used. */
5866 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5867 if (nsize != PyBytes_GET_SIZE(v))
5868 _PyBytes_Resize(&v, nsize);
5869 Py_XDECREF(errorHandler);
5870 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005871 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005872 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005873 error:
5874 Py_XDECREF(rep);
5875 Py_XDECREF(errorHandler);
5876 Py_XDECREF(exc);
5877 Py_XDECREF(v);
5878 return NULL;
5879#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880}
5881
Alexander Belopolsky40018472011-02-26 01:02:56 +00005882PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005883PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5884 Py_ssize_t size,
5885 const char *errors,
5886 int byteorder)
5887{
5888 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005889 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005890 if (tmp == NULL)
5891 return NULL;
5892 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5893 Py_DECREF(tmp);
5894 return result;
5895}
5896
5897PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005898PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005900 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901}
5902
5903/* --- Unicode Escape Codec ----------------------------------------------- */
5904
/* Cached pointer to the unicodedata name-lookup C API.  It starts out NULL;
   _PyUnicode_DecodeUnicodeEscape() fills it in through PyCapsule_Import()
   the first time it has to resolve a \N{...} escape and reuses the pointer
   afterwards. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005905static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005906
Alexander Belopolsky40018472011-02-26 01:02:56 +00005907PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005908_PyUnicode_DecodeUnicodeEscape(const char *s,
5909 Py_ssize_t size,
5910 const char *errors,
5911 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005913 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005914 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 PyObject *errorHandler = NULL;
5917 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005918
Eric V. Smith42454af2016-10-31 09:22:08 -04005919 // so we can remember if we've seen an invalid escape char or not
5920 *first_invalid_escape = NULL;
5921
Victor Stinner62ec3312016-09-06 17:04:34 -07005922 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005923 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005924 }
5925 /* Escaped strings will always be longer than the resulting
5926 Unicode string, so we start with size here and then reduce the
5927 length after conversion to the true value.
5928 (but if the error callback returns a long replacement string
5929 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005930 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005931 writer.min_length = size;
5932 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5933 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005934 }
5935
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 end = s + size;
5937 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005938 unsigned char c = (unsigned char) *s++;
5939 Py_UCS4 ch;
5940 int count;
5941 Py_ssize_t startinpos;
5942 Py_ssize_t endinpos;
5943 const char *message;
5944
5945#define WRITE_ASCII_CHAR(ch) \
5946 do { \
5947 assert(ch <= 127); \
5948 assert(writer.pos < writer.size); \
5949 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5950 } while(0)
5951
5952#define WRITE_CHAR(ch) \
5953 do { \
5954 if (ch <= writer.maxchar) { \
5955 assert(writer.pos < writer.size); \
5956 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5957 } \
5958 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5959 goto onError; \
5960 } \
5961 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962
5963 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005964 if (c != '\\') {
5965 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 continue;
5967 }
5968
Victor Stinner62ec3312016-09-06 17:04:34 -07005969 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005971 if (s >= end) {
5972 message = "\\ at end of string";
5973 goto error;
5974 }
5975 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005976
Victor Stinner62ec3312016-09-06 17:04:34 -07005977 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005978 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005981 case '\n': continue;
5982 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5983 case '\'': WRITE_ASCII_CHAR('\''); continue;
5984 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5985 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005986 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005987 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5988 case 't': WRITE_ASCII_CHAR('\t'); continue;
5989 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5990 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005991 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005992 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005993 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005994 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 case '0': case '1': case '2': case '3':
5998 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005999 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006000 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006001 ch = (ch<<3) + *s++ - '0';
6002 if (s < end && '0' <= *s && *s <= '7') {
6003 ch = (ch<<3) + *s++ - '0';
6004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006006 WRITE_CHAR(ch);
6007 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 /* hex escapes */
6010 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006012 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006013 message = "truncated \\xXX escape";
6014 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006018 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006019 message = "truncated \\uXXXX escape";
6020 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006023 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006024 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006025 message = "truncated \\UXXXXXXXX escape";
6026 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006027 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006028 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006029 ch <<= 4;
6030 if (c >= '0' && c <= '9') {
6031 ch += c - '0';
6032 }
6033 else if (c >= 'a' && c <= 'f') {
6034 ch += c - ('a' - 10);
6035 }
6036 else if (c >= 'A' && c <= 'F') {
6037 ch += c - ('A' - 10);
6038 }
6039 else {
6040 break;
6041 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006042 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006043 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006044 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006045 }
6046
6047 /* when we get here, ch is a 32-bit unicode character */
6048 if (ch > MAX_UNICODE) {
6049 message = "illegal Unicode character";
6050 goto error;
6051 }
6052
6053 WRITE_CHAR(ch);
6054 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006055
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006057 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006058 if (ucnhash_CAPI == NULL) {
6059 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006060 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6061 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006062 if (ucnhash_CAPI == NULL) {
6063 PyErr_SetString(
6064 PyExc_UnicodeError,
6065 "\\N escapes not supported (can't load unicodedata module)"
6066 );
6067 goto onError;
6068 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006069 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006070
6071 message = "malformed \\N character escape";
Miss Islington (bot)9fbcb142018-11-13 16:39:36 -08006072 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006073 const char *start = ++s;
6074 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006075 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006076 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006077 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006078 namelen = s - start;
6079 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006080 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006081 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006082 ch = 0xffffffff; /* in case 'getcode' messes up */
6083 if (namelen <= INT_MAX &&
6084 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6085 &ch, 0)) {
6086 assert(ch <= MAX_UNICODE);
6087 WRITE_CHAR(ch);
6088 continue;
6089 }
6090 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006091 }
6092 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006093 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006094
6095 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006096 if (*first_invalid_escape == NULL) {
6097 *first_invalid_escape = s-1; /* Back up one char, since we've
6098 already incremented s. */
6099 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006100 WRITE_ASCII_CHAR('\\');
6101 WRITE_CHAR(c);
6102 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006104
6105 error:
6106 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006107 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006108 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006109 errors, &errorHandler,
6110 "unicodeescape", message,
6111 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006112 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006113 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006114 }
Miss Islington (bot)09819ef2018-02-12 23:15:57 -08006115 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006116
6117#undef WRITE_ASCII_CHAR
6118#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006120
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006121 Py_XDECREF(errorHandler);
6122 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006123 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006124
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006126 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006127 Py_XDECREF(errorHandler);
6128 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 return NULL;
6130}
6131
Eric V. Smith42454af2016-10-31 09:22:08 -04006132PyObject *
6133PyUnicode_DecodeUnicodeEscape(const char *s,
6134 Py_ssize_t size,
6135 const char *errors)
6136{
6137 const char *first_invalid_escape;
6138 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6139 &first_invalid_escape);
6140 if (result == NULL)
6141 return NULL;
6142 if (first_invalid_escape != NULL) {
6143 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6144 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006145 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006146 Py_DECREF(result);
6147 return NULL;
6148 }
6149 }
6150 return result;
6151}
6152
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006153/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154
Alexander Belopolsky40018472011-02-26 01:02:56 +00006155PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006159 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006161 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006163 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164
Ezio Melottie7f90372012-10-05 03:33:31 +03006165 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006166 escape.
6167
Ezio Melottie7f90372012-10-05 03:33:31 +03006168 For UCS1 strings it's '\xxx', 4 bytes per source character.
6169 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6170 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006171 */
6172
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 if (!PyUnicode_Check(unicode)) {
6174 PyErr_BadArgument();
6175 return NULL;
6176 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006177 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006178 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006179 }
Victor Stinner358af132015-10-12 22:36:57 +02006180
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006182 if (len == 0) {
6183 return PyBytes_FromStringAndSize(NULL, 0);
6184 }
6185
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006186 kind = PyUnicode_KIND(unicode);
6187 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006188 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6189 bytes, and 1 byte characters 4. */
6190 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006191 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006192 return PyErr_NoMemory();
6193 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006194 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006195 if (repr == NULL) {
6196 return NULL;
6197 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006198
Victor Stinner62ec3312016-09-06 17:04:34 -07006199 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006200 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006201 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006202
Victor Stinner62ec3312016-09-06 17:04:34 -07006203 /* U+0000-U+00ff range */
6204 if (ch < 0x100) {
6205 if (ch >= ' ' && ch < 127) {
6206 if (ch != '\\') {
6207 /* Copy printable US ASCII as-is */
6208 *p++ = (char) ch;
6209 }
6210 /* Escape backslashes */
6211 else {
6212 *p++ = '\\';
6213 *p++ = '\\';
6214 }
6215 }
Victor Stinner358af132015-10-12 22:36:57 +02006216
Victor Stinner62ec3312016-09-06 17:04:34 -07006217 /* Map special whitespace to '\t', \n', '\r' */
6218 else if (ch == '\t') {
6219 *p++ = '\\';
6220 *p++ = 't';
6221 }
6222 else if (ch == '\n') {
6223 *p++ = '\\';
6224 *p++ = 'n';
6225 }
6226 else if (ch == '\r') {
6227 *p++ = '\\';
6228 *p++ = 'r';
6229 }
6230
6231 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6232 else {
6233 *p++ = '\\';
6234 *p++ = 'x';
6235 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6236 *p++ = Py_hexdigits[ch & 0x000F];
6237 }
Tim Petersced69f82003-09-16 20:30:58 +00006238 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006239 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006240 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 *p++ = '\\';
6242 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006243 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6244 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6245 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6246 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6249 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006250
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 /* Make sure that the first two digits are zero */
6252 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006253 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006254 *p++ = 'U';
6255 *p++ = '0';
6256 *p++ = '0';
6257 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6258 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6259 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6260 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6261 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6262 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265
Victor Stinner62ec3312016-09-06 17:04:34 -07006266 assert(p - PyBytes_AS_STRING(repr) > 0);
6267 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6268 return NULL;
6269 }
6270 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271}
6272
Alexander Belopolsky40018472011-02-26 01:02:56 +00006273PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006274PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6275 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006277 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006278 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006279 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006281 }
6282
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006283 result = PyUnicode_AsUnicodeEscapeString(tmp);
6284 Py_DECREF(tmp);
6285 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286}
6287
6288/* --- Raw Unicode Escape Codec ------------------------------------------- */
6289
Alexander Belopolsky40018472011-02-26 01:02:56 +00006290PyObject *
6291PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006292 Py_ssize_t size,
6293 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006295 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006296 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298 PyObject *errorHandler = NULL;
6299 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006300
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006302 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006303 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006304
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 /* Escaped strings will always be longer than the resulting
6306 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 length after conversion to the true value. (But decoding error
6308 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006309 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006310 writer.min_length = size;
6311 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6312 goto onError;
6313 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006314
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 end = s + size;
6316 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 unsigned char c = (unsigned char) *s++;
6318 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006319 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006320 Py_ssize_t startinpos;
6321 Py_ssize_t endinpos;
6322 const char *message;
6323
6324#define WRITE_CHAR(ch) \
6325 do { \
6326 if (ch <= writer.maxchar) { \
6327 assert(writer.pos < writer.size); \
6328 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6329 } \
6330 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6331 goto onError; \
6332 } \
6333 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 if (c != '\\' || s >= end) {
6337 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006339 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006340
Victor Stinner62ec3312016-09-06 17:04:34 -07006341 c = (unsigned char) *s++;
6342 if (c == 'u') {
6343 count = 4;
6344 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006346 else if (c == 'U') {
6347 count = 8;
6348 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006349 }
6350 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006351 assert(writer.pos < writer.size);
6352 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6353 WRITE_CHAR(c);
6354 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006355 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006356 startinpos = s - starts - 2;
6357
6358 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6359 for (ch = 0; count && s < end; ++s, --count) {
6360 c = (unsigned char)*s;
6361 ch <<= 4;
6362 if (c >= '0' && c <= '9') {
6363 ch += c - '0';
6364 }
6365 else if (c >= 'a' && c <= 'f') {
6366 ch += c - ('a' - 10);
6367 }
6368 else if (c >= 'A' && c <= 'F') {
6369 ch += c - ('A' - 10);
6370 }
6371 else {
6372 break;
6373 }
6374 }
6375 if (!count) {
6376 if (ch <= MAX_UNICODE) {
6377 WRITE_CHAR(ch);
6378 continue;
6379 }
6380 message = "\\Uxxxxxxxx out of range";
6381 }
6382
6383 endinpos = s-starts;
6384 writer.min_length = end - s + writer.pos;
6385 if (unicode_decode_call_errorhandler_writer(
6386 errors, &errorHandler,
6387 "rawunicodeescape", message,
6388 &starts, &end, &startinpos, &endinpos, &exc, &s,
6389 &writer)) {
6390 goto onError;
6391 }
Miss Islington (bot)09819ef2018-02-12 23:15:57 -08006392 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006393
6394#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 Py_XDECREF(errorHandler);
6397 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006398 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006399
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006401 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 Py_XDECREF(errorHandler);
6403 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006405
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406}
6407
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006408
Alexander Belopolsky40018472011-02-26 01:02:56 +00006409PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006410PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411{
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006414 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006415 int kind;
6416 void *data;
6417 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006419 if (!PyUnicode_Check(unicode)) {
6420 PyErr_BadArgument();
6421 return NULL;
6422 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006424 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006426 kind = PyUnicode_KIND(unicode);
6427 data = PyUnicode_DATA(unicode);
6428 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 if (kind == PyUnicode_1BYTE_KIND) {
6430 return PyBytes_FromStringAndSize(data, len);
6431 }
Victor Stinner0e368262011-11-10 20:12:49 +01006432
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6434 bytes, and 1 byte characters 4. */
6435 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006436
Victor Stinner62ec3312016-09-06 17:04:34 -07006437 if (len > PY_SSIZE_T_MAX / expandsize) {
6438 return PyErr_NoMemory();
6439 }
6440 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6441 if (repr == NULL) {
6442 return NULL;
6443 }
6444 if (len == 0) {
6445 return repr;
6446 }
6447
6448 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006449 for (pos = 0; pos < len; pos++) {
6450 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006451
Victor Stinner62ec3312016-09-06 17:04:34 -07006452 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6453 if (ch < 0x100) {
6454 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006455 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006456 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6457 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 *p++ = '\\';
6459 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006460 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6461 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6462 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6463 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006465 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6466 else {
6467 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6468 *p++ = '\\';
6469 *p++ = 'U';
6470 *p++ = '0';
6471 *p++ = '0';
6472 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6473 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6474 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6475 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6476 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6477 *p++ = Py_hexdigits[ch & 15];
6478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006480
Victor Stinner62ec3312016-09-06 17:04:34 -07006481 assert(p > PyBytes_AS_STRING(repr));
6482 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6483 return NULL;
6484 }
6485 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486}
6487
Alexander Belopolsky40018472011-02-26 01:02:56 +00006488PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006489PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6490 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006493 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006494 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006495 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006496 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6497 Py_DECREF(tmp);
6498 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499}
6500
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006501/* --- Unicode Internal Codec ------------------------------------------- */
6502
Alexander Belopolsky40018472011-02-26 01:02:56 +00006503PyObject *
6504_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006505 Py_ssize_t size,
6506 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006507{
6508 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006509 Py_ssize_t startinpos;
6510 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006511 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006512 const char *end;
6513 const char *reason;
6514 PyObject *errorHandler = NULL;
6515 PyObject *exc = NULL;
6516
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006517 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006518 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006519 1))
6520 return NULL;
6521
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006522 if (size < 0) {
6523 PyErr_BadInternalCall();
6524 return NULL;
6525 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006526 if (size == 0)
6527 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006528
Victor Stinner8f674cc2013-04-17 23:02:17 +02006529 _PyUnicodeWriter_Init(&writer);
6530 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6531 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006533 }
6534 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006535
Victor Stinner8f674cc2013-04-17 23:02:17 +02006536 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006537 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006538 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006539 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006540 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006541 endinpos = end-starts;
6542 reason = "truncated input";
6543 goto error;
6544 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006545 /* We copy the raw representation one byte at a time because the
6546 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006547 ((char *) &uch)[0] = s[0];
6548 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006549#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006550 ((char *) &uch)[2] = s[2];
6551 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006552#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006553 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006554#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006555 /* We have to sanity check the raw data, otherwise doom looms for
6556 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006557 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006558 endinpos = s - starts + Py_UNICODE_SIZE;
6559 reason = "illegal code point (> 0x10FFFF)";
6560 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006561 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006562#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006563 s += Py_UNICODE_SIZE;
6564#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006565 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006566 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006567 Py_UNICODE uch2;
6568 ((char *) &uch2)[0] = s[0];
6569 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006570 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006571 {
Victor Stinner551ac952011-11-29 22:58:13 +01006572 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006573 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006574 }
6575 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006576#endif
6577
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006578 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006579 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006580 continue;
6581
6582 error:
6583 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006584 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006585 errors, &errorHandler,
6586 "unicode_internal", reason,
6587 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006588 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006589 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006590 }
6591
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006592 Py_XDECREF(errorHandler);
6593 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006594 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006595
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006597 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006598 Py_XDECREF(errorHandler);
6599 Py_XDECREF(exc);
6600 return NULL;
6601}
6602
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603/* --- Latin-1 Codec ------------------------------------------------------ */
6604
Alexander Belopolsky40018472011-02-26 01:02:56 +00006605PyObject *
6606PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006607 Py_ssize_t size,
6608 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006611 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612}
6613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006615static void
6616make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006617 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006618 PyObject *unicode,
6619 Py_ssize_t startpos, Py_ssize_t endpos,
6620 const char *reason)
6621{
6622 if (*exceptionObject == NULL) {
6623 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006625 encoding, unicode, startpos, endpos, reason);
6626 }
6627 else {
6628 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6629 goto onError;
6630 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6631 goto onError;
6632 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6633 goto onError;
6634 return;
6635 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006636 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006637 }
6638}
6639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006640/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006641static void
6642raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006643 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006644 PyObject *unicode,
6645 Py_ssize_t startpos, Py_ssize_t endpos,
6646 const char *reason)
6647{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006648 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006649 encoding, unicode, startpos, endpos, reason);
6650 if (*exceptionObject != NULL)
6651 PyCodec_StrictErrors(*exceptionObject);
6652}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653
6654/* error handling callback helper:
6655 build arguments, call the callback and check the arguments,
6656 put the result into newpos and return the replacement string, which
6657 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658static PyObject *
6659unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006660 PyObject **errorHandler,
6661 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006662 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006663 Py_ssize_t startpos, Py_ssize_t endpos,
6664 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006666 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006667 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 PyObject *restuple;
6669 PyObject *resunicode;
6670
6671 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006673 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006675 }
6676
Benjamin Petersonbac79492012-01-14 13:34:47 -05006677 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 return NULL;
6679 len = PyUnicode_GET_LENGTH(unicode);
6680
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006681 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006682 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006686 restuple = PyObject_CallFunctionObjArgs(
6687 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006691 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006692 Py_DECREF(restuple);
6693 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006695 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 &resunicode, newpos)) {
6697 Py_DECREF(restuple);
6698 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006700 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6701 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6702 Py_DECREF(restuple);
6703 return NULL;
6704 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006706 *newpos = len + *newpos;
6707 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006708 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 Py_DECREF(restuple);
6710 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006711 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006712 Py_INCREF(resunicode);
6713 Py_DECREF(restuple);
6714 return resunicode;
6715}
6716
Alexander Belopolsky40018472011-02-26 01:02:56 +00006717static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006719 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006720 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006722 /* input state */
6723 Py_ssize_t pos=0, size;
6724 int kind;
6725 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006726 /* pointer into the output */
6727 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006728 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6729 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006730 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006731 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006732 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006733 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006734 /* output object */
6735 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736
Benjamin Petersonbac79492012-01-14 13:34:47 -05006737 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006738 return NULL;
6739 size = PyUnicode_GET_LENGTH(unicode);
6740 kind = PyUnicode_KIND(unicode);
6741 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 /* allocate enough for a simple encoding without
6743 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006744 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006745 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006746
6747 _PyBytesWriter_Init(&writer);
6748 str = _PyBytesWriter_Alloc(&writer, size);
6749 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006750 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006752 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006753 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006756 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006758 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006760 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006762 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006764 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006765 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006767
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006768 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006770
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006771 /* Only overallocate the buffer if it's not the last write */
6772 writer.overallocate = (collend < size);
6773
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006775 if (error_handler == _Py_ERROR_UNKNOWN)
6776 error_handler = get_error_handler(errors);
6777
6778 switch (error_handler) {
6779 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006780 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006782
6783 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006784 memset(str, '?', collend - collstart);
6785 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006786 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006787 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 break;
Victor Stinner50149202015-09-22 00:26:54 +02006790
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006791 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006792 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006793 writer.min_size -= (collend - collstart);
6794 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006795 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006796 if (str == NULL)
6797 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006798 pos = collend;
6799 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006800
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006801 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006802 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006803 writer.min_size -= (collend - collstart);
6804 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006805 unicode, collstart, collend);
6806 if (str == NULL)
6807 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006808 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 break;
Victor Stinner50149202015-09-22 00:26:54 +02006810
Victor Stinnerc3713e92015-09-29 12:32:13 +02006811 case _Py_ERROR_SURROGATEESCAPE:
6812 for (i = collstart; i < collend; ++i) {
6813 ch = PyUnicode_READ(kind, data, i);
6814 if (ch < 0xdc80 || 0xdcff < ch) {
6815 /* Not a UTF-8b surrogate */
6816 break;
6817 }
6818 *str++ = (char)(ch - 0xdc00);
6819 ++pos;
6820 }
6821 if (i >= collend)
6822 break;
6823 collstart = pos;
6824 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006825 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006826
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006828 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6829 encoding, reason, unicode, &exc,
6830 collstart, collend, &newpos);
6831 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006833
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006834 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006835 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006836
Victor Stinner6bd525b2015-10-09 13:10:05 +02006837 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006838 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006839 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006840 PyBytes_AS_STRING(rep),
6841 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006842 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006843 else {
6844 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006845
Victor Stinner6bd525b2015-10-09 13:10:05 +02006846 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006848
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006849 if (limit == 256 ?
6850 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6851 !PyUnicode_IS_ASCII(rep))
6852 {
6853 /* Not all characters are smaller than limit */
6854 raise_encode_exception(&exc, encoding, unicode,
6855 collstart, collend, reason);
6856 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006858 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6859 str = _PyBytesWriter_WriteBytes(&writer, str,
6860 PyUnicode_DATA(rep),
6861 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 }
Miss Islington (bot)1e596d32018-08-19 16:17:53 -04006863 if (str == NULL)
6864 goto onError;
6865
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006866 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006867 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006868 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006869
6870 /* If overallocation was disabled, ensure that it was the last
6871 write. Otherwise, we missed an optimization */
6872 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006873 }
6874 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006875
Victor Stinner50149202015-09-22 00:26:54 +02006876 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006877 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006878 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006879
6880 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006881 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006882 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006883 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006884 Py_XDECREF(exc);
6885 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006886}
6887
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006888/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006889PyObject *
6890PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006891 Py_ssize_t size,
6892 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006894 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006895 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006896 if (unicode == NULL)
6897 return NULL;
6898 result = unicode_encode_ucs1(unicode, errors, 256);
6899 Py_DECREF(unicode);
6900 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901}
6902
Alexander Belopolsky40018472011-02-26 01:02:56 +00006903PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006904_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905{
6906 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 PyErr_BadArgument();
6908 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006910 if (PyUnicode_READY(unicode) == -1)
6911 return NULL;
6912 /* Fast path: if it is a one-byte string, construct
6913 bytes object directly. */
6914 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6915 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6916 PyUnicode_GET_LENGTH(unicode));
6917 /* Non-Latin-1 characters present. Defer to above function to
6918 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006919 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006920}
6921
6922PyObject*
6923PyUnicode_AsLatin1String(PyObject *unicode)
6924{
6925 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926}
6927
6928/* --- 7-bit ASCII Codec -------------------------------------------------- */
6929
Alexander Belopolsky40018472011-02-26 01:02:56 +00006930PyObject *
6931PyUnicode_DecodeASCII(const char *s,
6932 Py_ssize_t size,
6933 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006935 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006936 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006937 int kind;
6938 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006939 Py_ssize_t startinpos;
6940 Py_ssize_t endinpos;
6941 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006942 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006943 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006944 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006945 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006946
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006948 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006951 if (size == 1 && (unsigned char)s[0] < 128)
6952 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006953
Victor Stinner8f674cc2013-04-17 23:02:17 +02006954 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006955 writer.min_length = size;
6956 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006957 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006959 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006960 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006961 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006962 writer.pos = outpos;
6963 if (writer.pos == size)
6964 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006965
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006966 s += writer.pos;
6967 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006968 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006969 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006971 PyUnicode_WRITE(kind, data, writer.pos, c);
6972 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006974 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006976
6977 /* byte outsize range 0x00..0x7f: call the error handler */
6978
6979 if (error_handler == _Py_ERROR_UNKNOWN)
6980 error_handler = get_error_handler(errors);
6981
6982 switch (error_handler)
6983 {
6984 case _Py_ERROR_REPLACE:
6985 case _Py_ERROR_SURROGATEESCAPE:
6986 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006987 but we may switch to UCS2 at the first write */
6988 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6989 goto onError;
6990 kind = writer.kind;
6991 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006992
6993 if (error_handler == _Py_ERROR_REPLACE)
6994 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6995 else
6996 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6997 writer.pos++;
6998 ++s;
6999 break;
7000
7001 case _Py_ERROR_IGNORE:
7002 ++s;
7003 break;
7004
7005 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 startinpos = s-starts;
7007 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007008 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007009 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 "ascii", "ordinal not in range(128)",
7011 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007012 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007014 kind = writer.kind;
7015 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007018 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007019 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007020 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007021
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007023 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007024 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007025 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 return NULL;
7027}
7028
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007029/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007030PyObject *
7031PyUnicode_EncodeASCII(const Py_UNICODE *p,
7032 Py_ssize_t size,
7033 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007035 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007036 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007037 if (unicode == NULL)
7038 return NULL;
7039 result = unicode_encode_ucs1(unicode, errors, 128);
7040 Py_DECREF(unicode);
7041 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042}
7043
Alexander Belopolsky40018472011-02-26 01:02:56 +00007044PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007045_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046{
7047 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 PyErr_BadArgument();
7049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007051 if (PyUnicode_READY(unicode) == -1)
7052 return NULL;
7053 /* Fast path: if it is an ASCII-only string, construct bytes object
7054 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007055 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007056 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7057 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007058 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007059}
7060
7061PyObject *
7062PyUnicode_AsASCIIString(PyObject *unicode)
7063{
7064 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065}
7066
Steve Dowercc16be82016-09-08 10:35:16 -07007067#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007068
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007069/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007070
/* When int is narrower than size_t, a buffer can be longer than INT_MAX;
   NEED_RETRY enables the chunked path in decode_code_page_stateful(). */
#if SIZEOF_INT < SIZEOF_SIZE_T
#  define NEED_RETRY
#endif

/* Provide the flag value when the included headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7078
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007079static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007080code_page_name(UINT code_page, PyObject **obj)
7081{
7082 *obj = NULL;
7083 if (code_page == CP_ACP)
7084 return "mbcs";
7085 if (code_page == CP_UTF7)
7086 return "CP_UTF7";
7087 if (code_page == CP_UTF8)
7088 return "CP_UTF8";
7089
7090 *obj = PyBytes_FromFormat("cp%u", code_page);
7091 if (*obj == NULL)
7092 return NULL;
7093 return PyBytes_AS_STRING(*obj);
7094}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007095
Victor Stinner3a50e702011-10-18 21:21:00 +02007096static DWORD
7097decode_code_page_flags(UINT code_page)
7098{
7099 if (code_page == CP_UTF7) {
7100 /* The CP_UTF7 decoder only supports flags=0 */
7101 return 0;
7102 }
7103 else
7104 return MB_ERR_INVALID_CHARS;
7105}
7106
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007108 * Decode a byte string from a Windows code page into unicode object in strict
7109 * mode.
7110 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007111 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7112 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007114static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007115decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007116 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 const char *in,
7118 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007119{
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007121 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007122 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123
7124 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007125 assert(insize > 0);
7126 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7127 if (outsize <= 0)
7128 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007129
7130 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007132 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007133 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 if (*v == NULL)
7135 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007136 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137 }
7138 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007141 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007144 }
7145
7146 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7148 if (outsize <= 0)
7149 goto error;
7150 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007151
Victor Stinner3a50e702011-10-18 21:21:00 +02007152error:
7153 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7154 return -2;
7155 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007156 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007157}
7158
Victor Stinner3a50e702011-10-18 21:21:00 +02007159/*
7160 * Decode a byte string from a code page into unicode object with an error
7161 * handler.
7162 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007163 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 * UnicodeDecodeError exception and returns -1 on error.
7165 */
7166static int
7167decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007168 PyObject **v,
7169 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007170 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007171{
7172 const char *startin = in;
7173 const char *endin = in + size;
7174 const DWORD flags = decode_code_page_flags(code_page);
7175 /* Ideally, we should get reason from FormatMessage. This is the Windows
7176 2000 English version of the message. */
7177 const char *reason = "No mapping for the Unicode character exists "
7178 "in the target code page.";
7179 /* each step cannot decode more than 1 character, but a character can be
7180 represented as a surrogate pair */
7181 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007182 int insize;
7183 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 PyObject *errorHandler = NULL;
7185 PyObject *exc = NULL;
7186 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007187 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 DWORD err;
7189 int ret = -1;
7190
7191 assert(size > 0);
7192
7193 encoding = code_page_name(code_page, &encoding_obj);
7194 if (encoding == NULL)
7195 return -1;
7196
Victor Stinner7d00cc12014-03-17 23:08:06 +01007197 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7199 UnicodeDecodeError. */
7200 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7201 if (exc != NULL) {
7202 PyCodec_StrictErrors(exc);
7203 Py_CLEAR(exc);
7204 }
7205 goto error;
7206 }
7207
7208 if (*v == NULL) {
7209 /* Create unicode object */
7210 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7211 PyErr_NoMemory();
7212 goto error;
7213 }
Victor Stinnerab595942011-12-17 04:59:06 +01007214 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007215 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 if (*v == NULL)
7217 goto error;
7218 startout = PyUnicode_AS_UNICODE(*v);
7219 }
7220 else {
7221 /* Extend unicode object */
7222 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7223 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7224 PyErr_NoMemory();
7225 goto error;
7226 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007227 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 goto error;
7229 startout = PyUnicode_AS_UNICODE(*v) + n;
7230 }
7231
7232 /* Decode the byte string character per character */
7233 out = startout;
7234 while (in < endin)
7235 {
7236 /* Decode a character */
7237 insize = 1;
7238 do
7239 {
7240 outsize = MultiByteToWideChar(code_page, flags,
7241 in, insize,
7242 buffer, Py_ARRAY_LENGTH(buffer));
7243 if (outsize > 0)
7244 break;
7245 err = GetLastError();
7246 if (err != ERROR_NO_UNICODE_TRANSLATION
7247 && err != ERROR_INSUFFICIENT_BUFFER)
7248 {
7249 PyErr_SetFromWindowsErr(0);
7250 goto error;
7251 }
7252 insize++;
7253 }
7254 /* 4=maximum length of a UTF-8 sequence */
7255 while (insize <= 4 && (in + insize) <= endin);
7256
7257 if (outsize <= 0) {
7258 Py_ssize_t startinpos, endinpos, outpos;
7259
Victor Stinner7d00cc12014-03-17 23:08:06 +01007260 /* last character in partial decode? */
7261 if (in + insize >= endin && !final)
7262 break;
7263
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 startinpos = in - startin;
7265 endinpos = startinpos + 1;
7266 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007267 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 errors, &errorHandler,
7269 encoding, reason,
7270 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007271 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 {
7273 goto error;
7274 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007275 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 }
7277 else {
7278 in += insize;
7279 memcpy(out, buffer, outsize * sizeof(wchar_t));
7280 out += outsize;
7281 }
7282 }
7283
7284 /* write a NUL character at the end */
7285 *out = 0;
7286
7287 /* Extend unicode object */
7288 outsize = out - startout;
7289 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007290 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007292 /* (in - startin) <= size and size is an int */
7293 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007294
7295error:
7296 Py_XDECREF(encoding_obj);
7297 Py_XDECREF(errorHandler);
7298 Py_XDECREF(exc);
7299 return ret;
7300}
7301
Victor Stinner3a50e702011-10-18 21:21:00 +02007302static PyObject *
7303decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007304 const char *s, Py_ssize_t size,
7305 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007306{
Victor Stinner76a31a62011-11-04 00:05:13 +01007307 PyObject *v = NULL;
7308 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007309
Victor Stinner3a50e702011-10-18 21:21:00 +02007310 if (code_page < 0) {
7311 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7312 return NULL;
7313 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007314 if (size < 0) {
7315 PyErr_BadInternalCall();
7316 return NULL;
7317 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007318
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007319 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007321
Victor Stinner76a31a62011-11-04 00:05:13 +01007322 do
7323 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007324#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007325 if (size > INT_MAX) {
7326 chunk_size = INT_MAX;
7327 final = 0;
7328 done = 0;
7329 }
7330 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007331#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007332 {
7333 chunk_size = (int)size;
7334 final = (consumed == NULL);
7335 done = 1;
7336 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
Victor Stinner76a31a62011-11-04 00:05:13 +01007338 if (chunk_size == 0 && done) {
7339 if (v != NULL)
7340 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007341 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007342 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007343
Victor Stinner76a31a62011-11-04 00:05:13 +01007344 converted = decode_code_page_strict(code_page, &v,
7345 s, chunk_size);
7346 if (converted == -2)
7347 converted = decode_code_page_errors(code_page, &v,
7348 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007349 errors, final);
7350 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007351
7352 if (converted < 0) {
7353 Py_XDECREF(v);
7354 return NULL;
7355 }
7356
7357 if (consumed)
7358 *consumed += converted;
7359
7360 s += converted;
7361 size -= converted;
7362 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007363
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007364 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007365}
7366
Alexander Belopolsky40018472011-02-26 01:02:56 +00007367PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007368PyUnicode_DecodeCodePageStateful(int code_page,
7369 const char *s,
7370 Py_ssize_t size,
7371 const char *errors,
7372 Py_ssize_t *consumed)
7373{
7374 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7375}
7376
7377PyObject *
7378PyUnicode_DecodeMBCSStateful(const char *s,
7379 Py_ssize_t size,
7380 const char *errors,
7381 Py_ssize_t *consumed)
7382{
7383 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7384}
7385
7386PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007387PyUnicode_DecodeMBCS(const char *s,
7388 Py_ssize_t size,
7389 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007390{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007391 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7392}
7393
Victor Stinner3a50e702011-10-18 21:21:00 +02007394static DWORD
7395encode_code_page_flags(UINT code_page, const char *errors)
7396{
7397 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007398 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 }
7400 else if (code_page == CP_UTF7) {
7401 /* CP_UTF7 only supports flags=0 */
7402 return 0;
7403 }
7404 else {
7405 if (errors != NULL && strcmp(errors, "replace") == 0)
7406 return 0;
7407 else
7408 return WC_NO_BEST_FIT_CHARS;
7409 }
7410}
7411
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007412/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 * Encode a Unicode string to a Windows code page into a byte string in strict
7414 * mode.
7415 *
7416 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007417 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007418 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007419static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007420encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007421 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007423{
Victor Stinner554f3f02010-06-16 23:33:54 +00007424 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 BOOL *pusedDefaultChar = &usedDefaultChar;
7426 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007427 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007428 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 const DWORD flags = encode_code_page_flags(code_page, NULL);
7430 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007431 /* Create a substring so that we can get the UTF-16 representation
7432 of just the slice under consideration. */
7433 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007434
Martin v. Löwis3d325192011-11-04 18:23:06 +01007435 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007436
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007438 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007440 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007441
Victor Stinner2fc507f2011-11-04 20:06:39 +01007442 substring = PyUnicode_Substring(unicode, offset, offset+len);
7443 if (substring == NULL)
7444 return -1;
7445 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7446 if (p == NULL) {
7447 Py_DECREF(substring);
7448 return -1;
7449 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007450 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007451
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007452 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007454 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 NULL, 0,
7456 NULL, pusedDefaultChar);
7457 if (outsize <= 0)
7458 goto error;
7459 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007460 if (pusedDefaultChar && *pusedDefaultChar) {
7461 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007463 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007464
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007468 if (*outbytes == NULL) {
7469 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007471 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473 }
7474 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 const Py_ssize_t n = PyBytes_Size(*outbytes);
7477 if (outsize > PY_SSIZE_T_MAX - n) {
7478 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007479 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007482 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7483 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007485 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007487 }
7488
7489 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007491 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 out, outsize,
7493 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007494 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 if (outsize <= 0)
7496 goto error;
7497 if (pusedDefaultChar && *pusedDefaultChar)
7498 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007499 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007500
Victor Stinner3a50e702011-10-18 21:21:00 +02007501error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007502 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7504 return -2;
7505 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007506 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007507}
7508
Victor Stinner3a50e702011-10-18 21:21:00 +02007509/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007510 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 * error handler.
7512 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007513 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 * -1 on other error.
7515 */
7516static int
7517encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007518 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007519 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007520{
Victor Stinner3a50e702011-10-18 21:21:00 +02007521 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007522 Py_ssize_t pos = unicode_offset;
7523 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 /* Ideally, we should get reason from FormatMessage. This is the Windows
7525 2000 English version of the message. */
7526 const char *reason = "invalid character";
7527 /* 4=maximum length of a UTF-8 sequence */
7528 char buffer[4];
7529 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7530 Py_ssize_t outsize;
7531 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 PyObject *errorHandler = NULL;
7533 PyObject *exc = NULL;
7534 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007535 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007536 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007537 PyObject *rep;
7538 int ret = -1;
7539
7540 assert(insize > 0);
7541
7542 encoding = code_page_name(code_page, &encoding_obj);
7543 if (encoding == NULL)
7544 return -1;
7545
7546 if (errors == NULL || strcmp(errors, "strict") == 0) {
7547 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7548 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007549 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007550 if (exc != NULL) {
7551 PyCodec_StrictErrors(exc);
7552 Py_DECREF(exc);
7553 }
7554 Py_XDECREF(encoding_obj);
7555 return -1;
7556 }
7557
7558 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7559 pusedDefaultChar = &usedDefaultChar;
7560 else
7561 pusedDefaultChar = NULL;
7562
7563 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7564 PyErr_NoMemory();
7565 goto error;
7566 }
7567 outsize = insize * Py_ARRAY_LENGTH(buffer);
7568
7569 if (*outbytes == NULL) {
7570 /* Create string object */
7571 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7572 if (*outbytes == NULL)
7573 goto error;
7574 out = PyBytes_AS_STRING(*outbytes);
7575 }
7576 else {
7577 /* Extend string object */
7578 Py_ssize_t n = PyBytes_Size(*outbytes);
7579 if (n > PY_SSIZE_T_MAX - outsize) {
7580 PyErr_NoMemory();
7581 goto error;
7582 }
7583 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7584 goto error;
7585 out = PyBytes_AS_STRING(*outbytes) + n;
7586 }
7587
7588 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007589 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007590 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007591 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7592 wchar_t chars[2];
7593 int charsize;
7594 if (ch < 0x10000) {
7595 chars[0] = (wchar_t)ch;
7596 charsize = 1;
7597 }
7598 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007599 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7600 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007601 charsize = 2;
7602 }
7603
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007605 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007606 buffer, Py_ARRAY_LENGTH(buffer),
7607 NULL, pusedDefaultChar);
7608 if (outsize > 0) {
7609 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7610 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007611 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007612 memcpy(out, buffer, outsize);
7613 out += outsize;
7614 continue;
7615 }
7616 }
7617 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7618 PyErr_SetFromWindowsErr(0);
7619 goto error;
7620 }
7621
Victor Stinner3a50e702011-10-18 21:21:00 +02007622 rep = unicode_encode_call_errorhandler(
7623 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007624 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007625 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 if (rep == NULL)
7627 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007628 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007629
7630 if (PyBytes_Check(rep)) {
7631 outsize = PyBytes_GET_SIZE(rep);
7632 if (outsize != 1) {
7633 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7634 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7635 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7636 Py_DECREF(rep);
7637 goto error;
7638 }
7639 out = PyBytes_AS_STRING(*outbytes) + offset;
7640 }
7641 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7642 out += outsize;
7643 }
7644 else {
7645 Py_ssize_t i;
7646 enum PyUnicode_Kind kind;
7647 void *data;
7648
Benjamin Petersonbac79492012-01-14 13:34:47 -05007649 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007650 Py_DECREF(rep);
7651 goto error;
7652 }
7653
7654 outsize = PyUnicode_GET_LENGTH(rep);
7655 if (outsize != 1) {
7656 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7657 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7658 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7659 Py_DECREF(rep);
7660 goto error;
7661 }
7662 out = PyBytes_AS_STRING(*outbytes) + offset;
7663 }
7664 kind = PyUnicode_KIND(rep);
7665 data = PyUnicode_DATA(rep);
7666 for (i=0; i < outsize; i++) {
7667 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7668 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007669 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007670 encoding, unicode,
7671 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007672 "unable to encode error handler result to ASCII");
7673 Py_DECREF(rep);
7674 goto error;
7675 }
7676 *out = (unsigned char)ch;
7677 out++;
7678 }
7679 }
7680 Py_DECREF(rep);
7681 }
7682 /* write a NUL byte */
7683 *out = 0;
7684 outsize = out - PyBytes_AS_STRING(*outbytes);
7685 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7686 if (_PyBytes_Resize(outbytes, outsize) < 0)
7687 goto error;
7688 ret = 0;
7689
7690error:
7691 Py_XDECREF(encoding_obj);
7692 Py_XDECREF(errorHandler);
7693 Py_XDECREF(exc);
7694 return ret;
7695}
7696
Victor Stinner3a50e702011-10-18 21:21:00 +02007697static PyObject *
7698encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007699 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007700 const char *errors)
7701{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007702 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007703 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007704 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007705 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007706
Victor Stinner29dacf22015-01-26 16:41:32 +01007707 if (!PyUnicode_Check(unicode)) {
7708 PyErr_BadArgument();
7709 return NULL;
7710 }
7711
Benjamin Petersonbac79492012-01-14 13:34:47 -05007712 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007713 return NULL;
7714 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007715
Victor Stinner3a50e702011-10-18 21:21:00 +02007716 if (code_page < 0) {
7717 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7718 return NULL;
7719 }
7720
Martin v. Löwis3d325192011-11-04 18:23:06 +01007721 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007722 return PyBytes_FromStringAndSize(NULL, 0);
7723
Victor Stinner7581cef2011-11-03 22:32:33 +01007724 offset = 0;
7725 do
7726 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007727#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007728 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007729 chunks. */
7730 if (len > INT_MAX/2) {
7731 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007732 done = 0;
7733 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007734 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007735#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007736 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007737 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007738 done = 1;
7739 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007740
Victor Stinner76a31a62011-11-04 00:05:13 +01007741 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007742 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007743 errors);
7744 if (ret == -2)
7745 ret = encode_code_page_errors(code_page, &outbytes,
7746 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007747 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007748 if (ret < 0) {
7749 Py_XDECREF(outbytes);
7750 return NULL;
7751 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007752
Victor Stinner7581cef2011-11-03 22:32:33 +01007753 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007754 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007755 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007756
Victor Stinner3a50e702011-10-18 21:21:00 +02007757 return outbytes;
7758}
7759
7760PyObject *
7761PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7762 Py_ssize_t size,
7763 const char *errors)
7764{
Victor Stinner7581cef2011-11-03 22:32:33 +01007765 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007766 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007767 if (unicode == NULL)
7768 return NULL;
7769 res = encode_code_page(CP_ACP, unicode, errors);
7770 Py_DECREF(unicode);
7771 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007772}
7773
7774PyObject *
7775PyUnicode_EncodeCodePage(int code_page,
7776 PyObject *unicode,
7777 const char *errors)
7778{
Victor Stinner7581cef2011-11-03 22:32:33 +01007779 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007780}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007781
Alexander Belopolsky40018472011-02-26 01:02:56 +00007782PyObject *
7783PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007784{
Victor Stinner7581cef2011-11-03 22:32:33 +01007785 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007786}
7787
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007788#undef NEED_RETRY
7789
Steve Dowercc16be82016-09-08 10:35:16 -07007790#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007791
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792/* --- Character Mapping Codec -------------------------------------------- */
7793
Victor Stinnerfb161b12013-04-18 01:44:27 +02007794static int
7795charmap_decode_string(const char *s,
7796 Py_ssize_t size,
7797 PyObject *mapping,
7798 const char *errors,
7799 _PyUnicodeWriter *writer)
7800{
7801 const char *starts = s;
7802 const char *e;
7803 Py_ssize_t startinpos, endinpos;
7804 PyObject *errorHandler = NULL, *exc = NULL;
7805 Py_ssize_t maplen;
7806 enum PyUnicode_Kind mapkind;
7807 void *mapdata;
7808 Py_UCS4 x;
7809 unsigned char ch;
7810
7811 if (PyUnicode_READY(mapping) == -1)
7812 return -1;
7813
7814 maplen = PyUnicode_GET_LENGTH(mapping);
7815 mapdata = PyUnicode_DATA(mapping);
7816 mapkind = PyUnicode_KIND(mapping);
7817
7818 e = s + size;
7819
7820 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7821 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7822 * is disabled in encoding aliases, latin1 is preferred because
7823 * its implementation is faster. */
7824 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7825 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7826 Py_UCS4 maxchar = writer->maxchar;
7827
7828 assert (writer->kind == PyUnicode_1BYTE_KIND);
7829 while (s < e) {
7830 ch = *s;
7831 x = mapdata_ucs1[ch];
7832 if (x > maxchar) {
7833 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7834 goto onError;
7835 maxchar = writer->maxchar;
7836 outdata = (Py_UCS1 *)writer->data;
7837 }
7838 outdata[writer->pos] = x;
7839 writer->pos++;
7840 ++s;
7841 }
7842 return 0;
7843 }
7844
7845 while (s < e) {
7846 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7847 enum PyUnicode_Kind outkind = writer->kind;
7848 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7849 if (outkind == PyUnicode_1BYTE_KIND) {
7850 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7851 Py_UCS4 maxchar = writer->maxchar;
7852 while (s < e) {
7853 ch = *s;
7854 x = mapdata_ucs2[ch];
7855 if (x > maxchar)
7856 goto Error;
7857 outdata[writer->pos] = x;
7858 writer->pos++;
7859 ++s;
7860 }
7861 break;
7862 }
7863 else if (outkind == PyUnicode_2BYTE_KIND) {
7864 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7865 while (s < e) {
7866 ch = *s;
7867 x = mapdata_ucs2[ch];
7868 if (x == 0xFFFE)
7869 goto Error;
7870 outdata[writer->pos] = x;
7871 writer->pos++;
7872 ++s;
7873 }
7874 break;
7875 }
7876 }
7877 ch = *s;
7878
7879 if (ch < maplen)
7880 x = PyUnicode_READ(mapkind, mapdata, ch);
7881 else
7882 x = 0xfffe; /* invalid value */
7883Error:
7884 if (x == 0xfffe)
7885 {
7886 /* undefined mapping */
7887 startinpos = s-starts;
7888 endinpos = startinpos+1;
7889 if (unicode_decode_call_errorhandler_writer(
7890 errors, &errorHandler,
7891 "charmap", "character maps to <undefined>",
7892 &starts, &e, &startinpos, &endinpos, &exc, &s,
7893 writer)) {
7894 goto onError;
7895 }
7896 continue;
7897 }
7898
7899 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7900 goto onError;
7901 ++s;
7902 }
7903 Py_XDECREF(errorHandler);
7904 Py_XDECREF(exc);
7905 return 0;
7906
7907onError:
7908 Py_XDECREF(errorHandler);
7909 Py_XDECREF(exc);
7910 return -1;
7911}
7912
7913static int
7914charmap_decode_mapping(const char *s,
7915 Py_ssize_t size,
7916 PyObject *mapping,
7917 const char *errors,
7918 _PyUnicodeWriter *writer)
7919{
7920 const char *starts = s;
7921 const char *e;
7922 Py_ssize_t startinpos, endinpos;
7923 PyObject *errorHandler = NULL, *exc = NULL;
7924 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007925 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007926
7927 e = s + size;
7928
7929 while (s < e) {
7930 ch = *s;
7931
7932 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7933 key = PyLong_FromLong((long)ch);
7934 if (key == NULL)
7935 goto onError;
7936
7937 item = PyObject_GetItem(mapping, key);
7938 Py_DECREF(key);
7939 if (item == NULL) {
7940 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7941 /* No mapping found means: mapping is undefined. */
7942 PyErr_Clear();
7943 goto Undefined;
7944 } else
7945 goto onError;
7946 }
7947
7948 /* Apply mapping */
7949 if (item == Py_None)
7950 goto Undefined;
7951 if (PyLong_Check(item)) {
7952 long value = PyLong_AS_LONG(item);
7953 if (value == 0xFFFE)
7954 goto Undefined;
7955 if (value < 0 || value > MAX_UNICODE) {
7956 PyErr_Format(PyExc_TypeError,
7957 "character mapping must be in range(0x%lx)",
7958 (unsigned long)MAX_UNICODE + 1);
7959 goto onError;
7960 }
7961
7962 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7963 goto onError;
7964 }
7965 else if (PyUnicode_Check(item)) {
7966 if (PyUnicode_READY(item) == -1)
7967 goto onError;
7968 if (PyUnicode_GET_LENGTH(item) == 1) {
7969 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7970 if (value == 0xFFFE)
7971 goto Undefined;
7972 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7973 goto onError;
7974 }
7975 else {
7976 writer->overallocate = 1;
7977 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7978 goto onError;
7979 }
7980 }
7981 else {
7982 /* wrong return value */
7983 PyErr_SetString(PyExc_TypeError,
7984 "character mapping must return integer, None or str");
7985 goto onError;
7986 }
7987 Py_CLEAR(item);
7988 ++s;
7989 continue;
7990
7991Undefined:
7992 /* undefined mapping */
7993 Py_CLEAR(item);
7994 startinpos = s-starts;
7995 endinpos = startinpos+1;
7996 if (unicode_decode_call_errorhandler_writer(
7997 errors, &errorHandler,
7998 "charmap", "character maps to <undefined>",
7999 &starts, &e, &startinpos, &endinpos, &exc, &s,
8000 writer)) {
8001 goto onError;
8002 }
8003 }
8004 Py_XDECREF(errorHandler);
8005 Py_XDECREF(exc);
8006 return 0;
8007
8008onError:
8009 Py_XDECREF(item);
8010 Py_XDECREF(errorHandler);
8011 Py_XDECREF(exc);
8012 return -1;
8013}
8014
Alexander Belopolsky40018472011-02-26 01:02:56 +00008015PyObject *
8016PyUnicode_DecodeCharmap(const char *s,
8017 Py_ssize_t size,
8018 PyObject *mapping,
8019 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008021 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008022
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 /* Default to Latin-1 */
8024 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008028 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008029 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008030 writer.min_length = size;
8031 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008033
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008034 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008035 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8036 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008037 }
8038 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008039 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8040 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008042 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008043
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008045 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046 return NULL;
8047}
8048
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008049/* Charmap encoding: the lookup table */
8050
Alexander Belopolsky40018472011-02-26 01:02:56 +00008051struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 PyObject_HEAD
8053 unsigned char level1[32];
8054 int count2, count3;
8055 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008056};
8057
8058static PyObject*
8059encoding_map_size(PyObject *obj, PyObject* args)
8060{
8061 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008062 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008064}
8065
8066static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008067 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 PyDoc_STR("Return the size (in bytes) of this object") },
8069 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008070};
8071
8072static void
8073encoding_map_dealloc(PyObject* o)
8074{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008075 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008076}
8077
8078static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008079 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008080 "EncodingMap", /*tp_name*/
8081 sizeof(struct encoding_map), /*tp_basicsize*/
8082 0, /*tp_itemsize*/
8083 /* methods */
8084 encoding_map_dealloc, /*tp_dealloc*/
8085 0, /*tp_print*/
8086 0, /*tp_getattr*/
8087 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008088 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 0, /*tp_repr*/
8090 0, /*tp_as_number*/
8091 0, /*tp_as_sequence*/
8092 0, /*tp_as_mapping*/
8093 0, /*tp_hash*/
8094 0, /*tp_call*/
8095 0, /*tp_str*/
8096 0, /*tp_getattro*/
8097 0, /*tp_setattro*/
8098 0, /*tp_as_buffer*/
8099 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8100 0, /*tp_doc*/
8101 0, /*tp_traverse*/
8102 0, /*tp_clear*/
8103 0, /*tp_richcompare*/
8104 0, /*tp_weaklistoffset*/
8105 0, /*tp_iter*/
8106 0, /*tp_iternext*/
8107 encoding_map_methods, /*tp_methods*/
8108 0, /*tp_members*/
8109 0, /*tp_getset*/
8110 0, /*tp_base*/
8111 0, /*tp_dict*/
8112 0, /*tp_descr_get*/
8113 0, /*tp_descr_set*/
8114 0, /*tp_dictoffset*/
8115 0, /*tp_init*/
8116 0, /*tp_alloc*/
8117 0, /*tp_new*/
8118 0, /*tp_free*/
8119 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120};
8121
8122PyObject*
8123PyUnicode_BuildEncodingMap(PyObject* string)
8124{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008125 PyObject *result;
8126 struct encoding_map *mresult;
8127 int i;
8128 int need_dict = 0;
8129 unsigned char level1[32];
8130 unsigned char level2[512];
8131 unsigned char *mlevel1, *mlevel2, *mlevel3;
8132 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008133 int kind;
8134 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008135 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008136 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008137
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008138 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008139 PyErr_BadArgument();
8140 return NULL;
8141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008142 kind = PyUnicode_KIND(string);
8143 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008144 length = PyUnicode_GET_LENGTH(string);
8145 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008146 memset(level1, 0xFF, sizeof level1);
8147 memset(level2, 0xFF, sizeof level2);
8148
8149 /* If there isn't a one-to-one mapping of NULL to \0,
8150 or if there are non-BMP characters, we need to use
8151 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008152 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008153 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008154 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008155 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 ch = PyUnicode_READ(kind, data, i);
8157 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008158 need_dict = 1;
8159 break;
8160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008162 /* unmapped character */
8163 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008164 l1 = ch >> 11;
8165 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008166 if (level1[l1] == 0xFF)
8167 level1[l1] = count2++;
8168 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170 }
8171
8172 if (count2 >= 0xFF || count3 >= 0xFF)
8173 need_dict = 1;
8174
8175 if (need_dict) {
8176 PyObject *result = PyDict_New();
8177 PyObject *key, *value;
8178 if (!result)
8179 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008180 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008181 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008182 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008183 if (!key || !value)
8184 goto failed1;
8185 if (PyDict_SetItem(result, key, value) == -1)
8186 goto failed1;
8187 Py_DECREF(key);
8188 Py_DECREF(value);
8189 }
8190 return result;
8191 failed1:
8192 Py_XDECREF(key);
8193 Py_XDECREF(value);
8194 Py_DECREF(result);
8195 return NULL;
8196 }
8197
8198 /* Create a three-level trie */
8199 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8200 16*count2 + 128*count3 - 1);
8201 if (!result)
8202 return PyErr_NoMemory();
8203 PyObject_Init(result, &EncodingMapType);
8204 mresult = (struct encoding_map*)result;
8205 mresult->count2 = count2;
8206 mresult->count3 = count3;
8207 mlevel1 = mresult->level1;
8208 mlevel2 = mresult->level23;
8209 mlevel3 = mresult->level23 + 16*count2;
8210 memcpy(mlevel1, level1, 32);
8211 memset(mlevel2, 0xFF, 16*count2);
8212 memset(mlevel3, 0, 128*count3);
8213 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008214 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008215 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008216 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8217 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008218 /* unmapped character */
8219 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008220 o1 = ch>>11;
8221 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008222 i2 = 16*mlevel1[o1] + o2;
8223 if (mlevel2[i2] == 0xFF)
8224 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008225 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008226 i3 = 128*mlevel2[i2] + o3;
8227 mlevel3[i3] = i;
8228 }
8229 return result;
8230}
8231
8232static int
Victor Stinner22168992011-11-20 17:09:18 +01008233encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008234{
8235 struct encoding_map *map = (struct encoding_map*)mapping;
8236 int l1 = c>>11;
8237 int l2 = (c>>7) & 0xF;
8238 int l3 = c & 0x7F;
8239 int i;
8240
Victor Stinner22168992011-11-20 17:09:18 +01008241 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008243 if (c == 0)
8244 return 0;
8245 /* level 1*/
8246 i = map->level1[l1];
8247 if (i == 0xFF) {
8248 return -1;
8249 }
8250 /* level 2*/
8251 i = map->level23[16*i+l2];
8252 if (i == 0xFF) {
8253 return -1;
8254 }
8255 /* level 3 */
8256 i = map->level23[16*map->count2 + 128*i + l3];
8257 if (i == 0) {
8258 return -1;
8259 }
8260 return i;
8261}
8262
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263/* Lookup the character ch in the mapping. If the character
8264 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008265 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008266static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008267charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268{
Christian Heimes217cfd12007-12-02 14:31:20 +00008269 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270 PyObject *x;
8271
8272 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274 x = PyObject_GetItem(mapping, w);
8275 Py_DECREF(w);
8276 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8278 /* No mapping found means: mapping is undefined. */
8279 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008280 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 } else
8282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008284 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008286 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 long value = PyLong_AS_LONG(x);
8288 if (value < 0 || value > 255) {
8289 PyErr_SetString(PyExc_TypeError,
8290 "character mapping must be in range(256)");
8291 Py_DECREF(x);
8292 return NULL;
8293 }
8294 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008296 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 /* wrong return value */
8300 PyErr_Format(PyExc_TypeError,
8301 "character mapping must return integer, bytes or None, not %.400s",
8302 x->ob_type->tp_name);
8303 Py_DECREF(x);
8304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 }
8306}
8307
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008308static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008309charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008310{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008311 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8312 /* exponentially overallocate to minimize reallocations */
8313 if (requiredsize < 2*outsize)
8314 requiredsize = 2*outsize;
8315 if (_PyBytes_Resize(outobj, requiredsize))
8316 return -1;
8317 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008318}
8319
/* Outcome reported by charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008323/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008324 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008325 space is available. Return a new reference to the object that
8326 was put in the output buffer, or Py_None, if the mapping was undefined
8327 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008328 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008329static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008330charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008331 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008332{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 PyObject *rep;
8334 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008335 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336
Christian Heimes90aa7642007-12-19 02:45:37 +00008337 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008338 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008340 if (res == -1)
8341 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 if (outsize<requiredsize)
8343 if (charmapencode_resize(outobj, outpos, requiredsize))
8344 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008345 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 outstart[(*outpos)++] = (char)res;
8347 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008348 }
8349
8350 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008351 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008353 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 Py_DECREF(rep);
8355 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008356 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 if (PyLong_Check(rep)) {
8358 Py_ssize_t requiredsize = *outpos+1;
8359 if (outsize<requiredsize)
8360 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8361 Py_DECREF(rep);
8362 return enc_EXCEPTION;
8363 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008364 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008366 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 else {
8368 const char *repchars = PyBytes_AS_STRING(rep);
8369 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8370 Py_ssize_t requiredsize = *outpos+repsize;
8371 if (outsize<requiredsize)
8372 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8373 Py_DECREF(rep);
8374 return enc_EXCEPTION;
8375 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008376 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 memcpy(outstart + *outpos, repchars, repsize);
8378 *outpos += repsize;
8379 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008381 Py_DECREF(rep);
8382 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383}
8384
8385/* handle an error in PyUnicode_EncodeCharmap
8386 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008387static int
8388charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008389 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008391 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008392 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393{
8394 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008395 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008396 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008397 enum PyUnicode_Kind kind;
8398 void *data;
8399 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008401 Py_ssize_t collstartpos = *inpos;
8402 Py_ssize_t collendpos = *inpos+1;
8403 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008404 const char *encoding = "charmap";
8405 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008406 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008407 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008408 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008409
Benjamin Petersonbac79492012-01-14 13:34:47 -05008410 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008411 return -1;
8412 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 /* find all unencodable characters */
8414 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008415 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008416 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008417 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008418 val = encoding_map_lookup(ch, mapping);
8419 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 break;
8421 ++collendpos;
8422 continue;
8423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008425 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8426 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 if (rep==NULL)
8428 return -1;
8429 else if (rep!=Py_None) {
8430 Py_DECREF(rep);
8431 break;
8432 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008433 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435 }
8436 /* cache callback name lookup
8437 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008438 if (*error_handler == _Py_ERROR_UNKNOWN)
8439 *error_handler = get_error_handler(errors);
8440
8441 switch (*error_handler) {
8442 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008443 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008444 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008445
8446 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008447 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 x = charmapencode_output('?', mapping, res, respos);
8449 if (x==enc_EXCEPTION) {
8450 return -1;
8451 }
8452 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008453 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 return -1;
8455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008456 }
8457 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008458 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008459 *inpos = collendpos;
8460 break;
Victor Stinner50149202015-09-22 00:26:54 +02008461
8462 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 /* generate replacement (temporarily (mis)uses p) */
8464 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 char buffer[2+29+1+1];
8466 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008467 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 for (cp = buffer; *cp; ++cp) {
8469 x = charmapencode_output(*cp, mapping, res, respos);
8470 if (x==enc_EXCEPTION)
8471 return -1;
8472 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008473 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 return -1;
8475 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008476 }
8477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008478 *inpos = collendpos;
8479 break;
Victor Stinner50149202015-09-22 00:26:54 +02008480
Benjamin Peterson14339b62009-01-31 16:36:08 +00008481 default:
Victor Stinner50149202015-09-22 00:26:54 +02008482 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008483 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008485 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008487 if (PyBytes_Check(repunicode)) {
8488 /* Directly copy bytes result to output. */
8489 Py_ssize_t outsize = PyBytes_Size(*res);
8490 Py_ssize_t requiredsize;
8491 repsize = PyBytes_Size(repunicode);
8492 requiredsize = *respos + repsize;
8493 if (requiredsize > outsize)
8494 /* Make room for all additional bytes. */
8495 if (charmapencode_resize(res, respos, requiredsize)) {
8496 Py_DECREF(repunicode);
8497 return -1;
8498 }
8499 memcpy(PyBytes_AsString(*res) + *respos,
8500 PyBytes_AsString(repunicode), repsize);
8501 *respos += repsize;
8502 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008503 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008504 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008505 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008506 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008507 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008508 Py_DECREF(repunicode);
8509 return -1;
8510 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008511 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008512 data = PyUnicode_DATA(repunicode);
8513 kind = PyUnicode_KIND(repunicode);
8514 for (index = 0; index < repsize; index++) {
8515 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8516 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008518 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 return -1;
8520 }
8521 else if (x==enc_FAILED) {
8522 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008523 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 return -1;
8525 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008526 }
8527 *inpos = newpos;
8528 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008529 }
8530 return 0;
8531}
8532
Alexander Belopolsky40018472011-02-26 01:02:56 +00008533PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008534_PyUnicode_EncodeCharmap(PyObject *unicode,
8535 PyObject *mapping,
8536 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 /* output object */
8539 PyObject *res = NULL;
8540 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008541 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008542 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008544 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008545 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008547 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008548 void *data;
8549 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550
Benjamin Petersonbac79492012-01-14 13:34:47 -05008551 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008552 return NULL;
8553 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008554 data = PyUnicode_DATA(unicode);
8555 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008556
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557 /* Default to Latin-1 */
8558 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008559 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 /* allocate enough for a simple encoding without
8562 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008563 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 if (res == NULL)
8565 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008566 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008569 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008570 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008572 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 if (x==enc_EXCEPTION) /* error */
8574 goto onError;
8575 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008576 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008578 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 &res, &respos)) {
8580 goto onError;
8581 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008582 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 else
8584 /* done with this character => adjust input position */
8585 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008589 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008590 if (_PyBytes_Resize(&res, respos) < 0)
8591 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008593 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008594 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595 return res;
8596
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598 Py_XDECREF(res);
8599 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008600 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601 return NULL;
8602}
8603
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008604/* Deprecated */
8605PyObject *
8606PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8607 Py_ssize_t size,
8608 PyObject *mapping,
8609 const char *errors)
8610{
8611 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008612 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008613 if (unicode == NULL)
8614 return NULL;
8615 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8616 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008617 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008618}
8619
Alexander Belopolsky40018472011-02-26 01:02:56 +00008620PyObject *
8621PyUnicode_AsCharmapString(PyObject *unicode,
8622 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623{
8624 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 PyErr_BadArgument();
8626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008628 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629}
8630
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008632static void
8633make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008635 Py_ssize_t startpos, Py_ssize_t endpos,
8636 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 *exceptionObject = _PyUnicodeTranslateError_Create(
8640 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641 }
8642 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8644 goto onError;
8645 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8646 goto onError;
8647 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8648 goto onError;
8649 return;
8650 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008651 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652 }
8653}
8654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008655/* error handling callback helper:
8656 build arguments, call the callback and check the arguments,
8657 put the result into newpos and return the replacement string, which
8658 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008659static PyObject *
8660unicode_translate_call_errorhandler(const char *errors,
8661 PyObject **errorHandler,
8662 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008664 Py_ssize_t startpos, Py_ssize_t endpos,
8665 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008667 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008669 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 PyObject *restuple;
8671 PyObject *resunicode;
8672
8673 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677 }
8678
8679 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008684 restuple = PyObject_CallFunctionObjArgs(
8685 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008689 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 Py_DECREF(restuple);
8691 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008693 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 &resunicode, &i_newpos)) {
8695 Py_DECREF(restuple);
8696 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008698 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008700 else
8701 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008702 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008703 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 Py_DECREF(restuple);
8705 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008706 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707 Py_INCREF(resunicode);
8708 Py_DECREF(restuple);
8709 return resunicode;
8710}
8711
8712/* Lookup the character ch in the mapping and put the result in result,
8713 which must be decrefed by the caller.
8714 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008715static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717{
Christian Heimes217cfd12007-12-02 14:31:20 +00008718 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719 PyObject *x;
8720
8721 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008723 x = PyObject_GetItem(mapping, w);
8724 Py_DECREF(w);
8725 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8727 /* No mapping found means: use 1:1 mapping. */
8728 PyErr_Clear();
8729 *result = NULL;
8730 return 0;
8731 } else
8732 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733 }
8734 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 *result = x;
8736 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008738 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008740 if (value < 0 || value > MAX_UNICODE) {
8741 PyErr_Format(PyExc_ValueError,
8742 "character mapping must be in range(0x%x)",
8743 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 Py_DECREF(x);
8745 return -1;
8746 }
8747 *result = x;
8748 return 0;
8749 }
8750 else if (PyUnicode_Check(x)) {
8751 *result = x;
8752 return 0;
8753 }
8754 else {
8755 /* wrong return value */
8756 PyErr_SetString(PyExc_TypeError,
8757 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008758 Py_DECREF(x);
8759 return -1;
8760 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008761}
Victor Stinner1194ea02014-04-04 19:37:40 +02008762
8763/* lookup the character, write the result into the writer.
8764 Return 1 if the result was written into the writer, return 0 if the mapping
8765 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008766static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008767charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8768 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008769{
Victor Stinner1194ea02014-04-04 19:37:40 +02008770 PyObject *item;
8771
8772 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008774
8775 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008777 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008780 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008782
8783 if (item == Py_None) {
8784 Py_DECREF(item);
8785 return 0;
8786 }
8787
8788 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008789 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8790 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8791 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008792 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8793 Py_DECREF(item);
8794 return -1;
8795 }
8796 Py_DECREF(item);
8797 return 1;
8798 }
8799
8800 if (!PyUnicode_Check(item)) {
8801 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008803 }
8804
8805 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8806 Py_DECREF(item);
8807 return -1;
8808 }
8809
8810 Py_DECREF(item);
8811 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008812}
8813
Victor Stinner89a76ab2014-04-05 11:44:04 +02008814static int
8815unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8816 Py_UCS1 *translate)
8817{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008818 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008819 int ret = 0;
8820
Victor Stinner89a76ab2014-04-05 11:44:04 +02008821 if (charmaptranslate_lookup(ch, mapping, &item)) {
8822 return -1;
8823 }
8824
8825 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008826 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008827 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008828 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008829 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008830 /* not found => default to 1:1 mapping */
8831 translate[ch] = ch;
8832 return 1;
8833 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008834 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008835 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008836 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8837 used it */
8838 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008839 /* invalid character or character outside ASCII:
8840 skip the fast translate */
8841 goto exit;
8842 }
8843 translate[ch] = (Py_UCS1)replace;
8844 }
8845 else if (PyUnicode_Check(item)) {
8846 Py_UCS4 replace;
8847
8848 if (PyUnicode_READY(item) == -1) {
8849 Py_DECREF(item);
8850 return -1;
8851 }
8852 if (PyUnicode_GET_LENGTH(item) != 1)
8853 goto exit;
8854
8855 replace = PyUnicode_READ_CHAR(item, 0);
8856 if (replace > 127)
8857 goto exit;
8858 translate[ch] = (Py_UCS1)replace;
8859 }
8860 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008861 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008862 goto exit;
8863 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008864 ret = 1;
8865
Benjamin Peterson1365de72014-04-07 20:15:41 -04008866 exit:
8867 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008868 return ret;
8869}
8870
8871/* Fast path for ascii => ascii translation. Return 1 if the whole string
8872 was translated into writer, return 0 if the input string was partially
8873 translated into writer, raise an exception and return -1 on error. */
8874static int
8875unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008876 _PyUnicodeWriter *writer, int ignore,
8877 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008878{
Victor Stinner872b2912014-04-05 14:27:07 +02008879 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008880 Py_ssize_t len;
8881 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008882 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008883
Victor Stinner89a76ab2014-04-05 11:44:04 +02008884 len = PyUnicode_GET_LENGTH(input);
8885
Victor Stinner872b2912014-04-05 14:27:07 +02008886 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008887
8888 in = PyUnicode_1BYTE_DATA(input);
8889 end = in + len;
8890
8891 assert(PyUnicode_IS_ASCII(writer->buffer));
8892 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8893 out = PyUnicode_1BYTE_DATA(writer->buffer);
8894
Victor Stinner872b2912014-04-05 14:27:07 +02008895 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008896 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008897 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008898 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008899 int translate = unicode_fast_translate_lookup(mapping, ch,
8900 ascii_table);
8901 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008902 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008903 if (translate == 0)
8904 goto exit;
8905 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008906 }
Victor Stinner872b2912014-04-05 14:27:07 +02008907 if (ch2 == 0xfe) {
8908 if (ignore)
8909 continue;
8910 goto exit;
8911 }
8912 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008913 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008914 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008915 }
Victor Stinner872b2912014-04-05 14:27:07 +02008916 res = 1;
8917
8918exit:
8919 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008920 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008921 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008922}
8923
Victor Stinner3222da22015-10-01 22:07:32 +02008924static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925_PyUnicode_TranslateCharmap(PyObject *input,
8926 PyObject *mapping,
8927 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008930 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 Py_ssize_t size, i;
8932 int kind;
8933 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008934 _PyUnicodeWriter writer;
8935 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008936 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008937 PyObject *errorHandler = NULL;
8938 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008939 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008940 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008941
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 PyErr_BadArgument();
8944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 if (PyUnicode_READY(input) == -1)
8948 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008949 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 kind = PyUnicode_KIND(input);
8951 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008953 if (size == 0)
8954 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008956 /* allocate enough for a simple 1:1 translation without
8957 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008958 _PyUnicodeWriter_Init(&writer);
8959 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961
Victor Stinner872b2912014-04-05 14:27:07 +02008962 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8963
Victor Stinner33798672016-03-01 21:59:58 +01008964 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008965 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008966 if (PyUnicode_IS_ASCII(input)) {
8967 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8968 if (res < 0) {
8969 _PyUnicodeWriter_Dealloc(&writer);
8970 return NULL;
8971 }
8972 if (res == 1)
8973 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008974 }
Victor Stinner33798672016-03-01 21:59:58 +01008975 else {
8976 i = 0;
8977 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008981 int translate;
8982 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8983 Py_ssize_t newpos;
8984 /* startpos for collecting untranslatable chars */
8985 Py_ssize_t collstart;
8986 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008987 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988
Victor Stinner1194ea02014-04-04 19:37:40 +02008989 ch = PyUnicode_READ(kind, data, i);
8990 translate = charmaptranslate_output(ch, mapping, &writer);
8991 if (translate < 0)
8992 goto onError;
8993
8994 if (translate != 0) {
8995 /* it worked => adjust input pointer */
8996 ++i;
8997 continue;
8998 }
8999
9000 /* untranslatable character */
9001 collstart = i;
9002 collend = i+1;
9003
9004 /* find all untranslatable characters */
9005 while (collend < size) {
9006 PyObject *x;
9007 ch = PyUnicode_READ(kind, data, collend);
9008 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009009 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009010 Py_XDECREF(x);
9011 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009012 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009013 ++collend;
9014 }
9015
9016 if (ignore) {
9017 i = collend;
9018 }
9019 else {
9020 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9021 reason, input, &exc,
9022 collstart, collend, &newpos);
9023 if (repunicode == NULL)
9024 goto onError;
9025 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009026 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009027 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009028 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009029 Py_DECREF(repunicode);
9030 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009031 }
9032 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009033 Py_XDECREF(exc);
9034 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009035 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036
Benjamin Peterson29060642009-01-31 22:14:21 +00009037 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009038 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009039 Py_XDECREF(exc);
9040 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041 return NULL;
9042}
9043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044/* Deprecated. Use PyUnicode_Translate instead. */
9045PyObject *
9046PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9047 Py_ssize_t size,
9048 PyObject *mapping,
9049 const char *errors)
9050{
Christian Heimes5f520f42012-09-11 14:03:25 +02009051 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009052 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 if (!unicode)
9054 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009055 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9056 Py_DECREF(unicode);
9057 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058}
9059
Alexander Belopolsky40018472011-02-26 01:02:56 +00009060PyObject *
9061PyUnicode_Translate(PyObject *str,
9062 PyObject *mapping,
9063 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009064{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009065 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009066 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009067 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068}
Tim Petersced69f82003-09-16 20:30:58 +00009069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070PyObject *
9071_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9072{
9073 if (!PyUnicode_Check(unicode)) {
9074 PyErr_BadInternalCall();
9075 return NULL;
9076 }
9077 if (PyUnicode_READY(unicode) == -1)
9078 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009079 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 /* If the string is already ASCII, just return the same string */
9081 Py_INCREF(unicode);
9082 return unicode;
9083 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009084
9085 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9086 PyObject *result = PyUnicode_New(len, 127);
9087 if (result == NULL) {
9088 return NULL;
9089 }
9090
9091 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9092 int kind = PyUnicode_KIND(unicode);
9093 const void *data = PyUnicode_DATA(unicode);
9094 Py_ssize_t i;
9095 for (i = 0; i < len; ++i) {
9096 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9097 if (ch < 127) {
9098 out[i] = ch;
9099 }
9100 else if (Py_UNICODE_ISSPACE(ch)) {
9101 out[i] = ' ';
9102 }
9103 else {
9104 int decimal = Py_UNICODE_TODECIMAL(ch);
9105 if (decimal < 0) {
9106 out[i] = '?';
Miss Islington (bot)c7214722018-07-13 20:58:12 -07009107 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009108 _PyUnicode_LENGTH(result) = i + 1;
9109 break;
9110 }
9111 out[i] = '0' + decimal;
9112 }
9113 }
9114
Miss Islington (bot)c7214722018-07-13 20:58:12 -07009115 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009116 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117}
9118
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009119PyObject *
9120PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9121 Py_ssize_t length)
9122{
Victor Stinnerf0124502011-11-21 23:12:56 +01009123 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009124 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009125 Py_UCS4 maxchar;
9126 enum PyUnicode_Kind kind;
9127 void *data;
9128
Victor Stinner99d7ad02012-02-22 13:37:39 +01009129 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009130 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009131 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009132 if (ch > 127) {
9133 int decimal = Py_UNICODE_TODECIMAL(ch);
9134 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009135 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009136 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009137 }
9138 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009139
9140 /* Copy to a new string */
9141 decimal = PyUnicode_New(length, maxchar);
9142 if (decimal == NULL)
9143 return decimal;
9144 kind = PyUnicode_KIND(decimal);
9145 data = PyUnicode_DATA(decimal);
9146 /* Iterate over code points */
9147 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009148 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009149 if (ch > 127) {
9150 int decimal = Py_UNICODE_TODECIMAL(ch);
9151 if (decimal >= 0)
9152 ch = '0' + decimal;
9153 }
9154 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009156 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009157}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009158/* --- Decimal Encoder ---------------------------------------------------- */
9159
Alexander Belopolsky40018472011-02-26 01:02:56 +00009160int
9161PyUnicode_EncodeDecimal(Py_UNICODE *s,
9162 Py_ssize_t length,
9163 char *output,
9164 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009165{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009166 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009167 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009168 enum PyUnicode_Kind kind;
9169 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009170
9171 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 PyErr_BadArgument();
9173 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009174 }
9175
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009176 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009177 if (unicode == NULL)
9178 return -1;
9179
Victor Stinner42bf7752011-11-21 22:52:58 +01009180 kind = PyUnicode_KIND(unicode);
9181 data = PyUnicode_DATA(unicode);
9182
Victor Stinnerb84d7232011-11-22 01:50:07 +01009183 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009184 PyObject *exc;
9185 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009187 Py_ssize_t startpos;
9188
9189 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009190
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009192 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009193 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009195 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 decimal = Py_UNICODE_TODECIMAL(ch);
9197 if (decimal >= 0) {
9198 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009199 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009200 continue;
9201 }
9202 if (0 < ch && ch < 256) {
9203 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009204 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009205 continue;
9206 }
Victor Stinner6345be92011-11-25 20:09:01 +01009207
Victor Stinner42bf7752011-11-21 22:52:58 +01009208 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009209 exc = NULL;
9210 raise_encode_exception(&exc, "decimal", unicode,
9211 startpos, startpos+1,
9212 "invalid decimal Unicode string");
9213 Py_XDECREF(exc);
9214 Py_DECREF(unicode);
9215 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009216 }
9217 /* 0-terminate the output string */
9218 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009219 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009220 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009221}
9222
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223/* --- Helpers ------------------------------------------------------------ */
9224
/* Helper macro to fix up start/end slice values in place.

   An `end` greater than `len` is clamped to `len`.  A negative `end` or
   `start` is taken relative to `len` and clamped to 0 if it is still
   negative afterwards.  `start` is not clamped against `len`.

   `start` and `end` must be modifiable lvalues.  All three arguments
   are evaluated more than once, so they must be free of side effects.

   The body is wrapped in do { } while (0) so that the macro followed by
   a semicolon is exactly one statement and can be used as the body of
   an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009241any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009243 Py_ssize_t end,
9244 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009246 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 void *buf1, *buf2;
9248 Py_ssize_t len1, len2, result;
9249
9250 kind1 = PyUnicode_KIND(s1);
9251 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009252 if (kind1 < kind2)
9253 return -1;
9254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255 len1 = PyUnicode_GET_LENGTH(s1);
9256 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009257 ADJUST_INDICES(start, end, len1);
9258 if (end - start < len2)
9259 return -1;
9260
9261 buf1 = PyUnicode_DATA(s1);
9262 buf2 = PyUnicode_DATA(s2);
9263 if (len2 == 1) {
9264 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9265 result = findchar((const char *)buf1 + kind1*start,
9266 kind1, end - start, ch, direction);
9267 if (result == -1)
9268 return -1;
9269 else
9270 return start + result;
9271 }
9272
9273 if (kind2 != kind1) {
9274 buf2 = _PyUnicode_AsKind(s2, kind1);
9275 if (!buf2)
9276 return -2;
9277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278
Victor Stinner794d5672011-10-10 03:21:36 +02009279 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009280 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009281 case PyUnicode_1BYTE_KIND:
9282 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9283 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9284 else
9285 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9286 break;
9287 case PyUnicode_2BYTE_KIND:
9288 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9289 break;
9290 case PyUnicode_4BYTE_KIND:
9291 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9292 break;
9293 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009294 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009295 }
9296 }
9297 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009298 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009299 case PyUnicode_1BYTE_KIND:
9300 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9301 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9302 else
9303 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9304 break;
9305 case PyUnicode_2BYTE_KIND:
9306 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9307 break;
9308 case PyUnicode_4BYTE_KIND:
9309 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9310 break;
9311 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009312 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 }
9315
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009316 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 PyMem_Free(buf2);
9318
9319 return result;
9320}
9321
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009322/* _PyUnicode_InsertThousandsGrouping() helper functions */
9323#include "stringlib/localeutil.h"
9324
9325/**
9326 * InsertThousandsGrouping:
9327 * @writer: Unicode writer.
9328 * @n_buffer: Number of characters in @buffer.
9329 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9330 * @d_pos: Start of digits string.
9331 * @n_digits: The number of digits in the string, in which we want
9332 * to put the grouping chars.
9333 * @min_width: The minimum width of the digits in the output string.
9334 * Output will be zero-padded on the left to fill.
9335 * @grouping: see definition in localeconv().
9336 * @thousands_sep: see definition in localeconv().
9337 *
9338 * There are 2 modes: counting and filling. If @writer is NULL,
9339 * we are in counting mode, else filling mode.
9340 * If counting, the required buffer size is returned.
9341 * If filling, we know the buffer will be large enough, so we don't
9342 * need to pass in the buffer size.
9343 * Inserts thousand grouping characters (as defined by grouping and
9344 * thousands_sep) into @writer.
9345 *
9346 * Return value: -1 on error, number of characters otherwise.
9347 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009349_PyUnicode_InsertThousandsGrouping(
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009350 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009351 Py_ssize_t n_buffer,
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009352 PyObject *digits,
9353 Py_ssize_t d_pos,
9354 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009355 Py_ssize_t min_width,
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009356 const char *grouping,
9357 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009358 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359{
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009360 if (writer) {
9361 assert(digits != NULL);
9362 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009363 }
9364 else {
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009365 assert(digits == NULL);
9366 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009368 assert(0 <= d_pos);
9369 assert(0 <= n_digits);
9370 assert(0 <= min_width);
9371 assert(grouping != NULL);
9372
9373 if (digits != NULL) {
9374 if (PyUnicode_READY(digits) == -1) {
9375 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009376 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009377 }
9378 if (PyUnicode_READY(thousands_sep) == -1) {
9379 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009380 }
9381
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009382 Py_ssize_t count = 0;
9383 Py_ssize_t n_zeros;
9384 int loop_broken = 0;
9385 int use_separator = 0; /* First time through, don't append the
9386 separator. They only go between
9387 groups. */
9388 Py_ssize_t buffer_pos;
9389 Py_ssize_t digits_pos;
9390 Py_ssize_t len;
9391 Py_ssize_t n_chars;
9392 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9393 be looked at */
9394 /* A generator that returns all of the grouping widths, until it
9395 returns 0. */
9396 GroupGenerator groupgen;
9397 GroupGenerator_init(&groupgen, grouping);
9398 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9399
9400 /* if digits are not grouped, thousands separator
9401 should be an empty string */
9402 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9403
9404 digits_pos = d_pos + n_digits;
9405 if (writer) {
9406 buffer_pos = writer->pos + n_buffer;
9407 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9408 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009410 else {
9411 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009412 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009413
9414 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009415 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009416 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009417
9418 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9419 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9420 n_zeros = Py_MAX(0, len - remaining);
9421 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9422
9423 /* Use n_zero zero's and n_chars chars */
9424
9425 /* Count only, don't do anything. */
9426 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9427
9428 /* Copy into the writer. */
9429 InsertThousandsGrouping_fill(writer, &buffer_pos,
9430 digits, &digits_pos,
9431 n_chars, n_zeros,
9432 use_separator ? thousands_sep : NULL,
9433 thousands_sep_len, maxchar);
9434
9435 /* Use a separator next time. */
9436 use_separator = 1;
9437
9438 remaining -= n_chars;
9439 min_width -= len;
9440
9441 if (remaining <= 0 && min_width <= 0) {
9442 loop_broken = 1;
9443 break;
9444 }
9445 min_width -= thousands_sep_len;
9446 }
9447 if (!loop_broken) {
9448 /* We left the loop without using a break statement. */
9449
9450 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9451 n_zeros = Py_MAX(0, len - remaining);
9452 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9453
9454 /* Use n_zero zero's and n_chars chars */
9455 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9456
9457 /* Copy into the writer. */
9458 InsertThousandsGrouping_fill(writer, &buffer_pos,
9459 digits, &digits_pos,
9460 n_chars, n_zeros,
9461 use_separator ? thousands_sep : NULL,
9462 thousands_sep_len, maxchar);
9463 }
9464 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465}
9466
9467
Alexander Belopolsky40018472011-02-26 01:02:56 +00009468Py_ssize_t
9469PyUnicode_Count(PyObject *str,
9470 PyObject *substr,
9471 Py_ssize_t start,
9472 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009474 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009475 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 void *buf1 = NULL, *buf2 = NULL;
9477 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009478
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009479 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009480 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009481
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009482 kind1 = PyUnicode_KIND(str);
9483 kind2 = PyUnicode_KIND(substr);
9484 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009485 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009486
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009487 len1 = PyUnicode_GET_LENGTH(str);
9488 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009490 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009491 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009492
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009493 buf1 = PyUnicode_DATA(str);
9494 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009495 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009496 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009497 if (!buf2)
9498 goto onError;
9499 }
9500
9501 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009503 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009504 result = asciilib_count(
9505 ((Py_UCS1*)buf1) + start, end - start,
9506 buf2, len2, PY_SSIZE_T_MAX
9507 );
9508 else
9509 result = ucs1lib_count(
9510 ((Py_UCS1*)buf1) + start, end - start,
9511 buf2, len2, PY_SSIZE_T_MAX
9512 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 break;
9514 case PyUnicode_2BYTE_KIND:
9515 result = ucs2lib_count(
9516 ((Py_UCS2*)buf1) + start, end - start,
9517 buf2, len2, PY_SSIZE_T_MAX
9518 );
9519 break;
9520 case PyUnicode_4BYTE_KIND:
9521 result = ucs4lib_count(
9522 ((Py_UCS4*)buf1) + start, end - start,
9523 buf2, len2, PY_SSIZE_T_MAX
9524 );
9525 break;
9526 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009527 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009529
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009530 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 PyMem_Free(buf2);
9532
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009535 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 PyMem_Free(buf2);
9537 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538}
9539
Alexander Belopolsky40018472011-02-26 01:02:56 +00009540Py_ssize_t
9541PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009542 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009543 Py_ssize_t start,
9544 Py_ssize_t end,
9545 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009547 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009548 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009549
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009550 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551}
9552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553Py_ssize_t
9554PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9555 Py_ssize_t start, Py_ssize_t end,
9556 int direction)
9557{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009559 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 if (PyUnicode_READY(str) == -1)
9561 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009562 len = PyUnicode_GET_LENGTH(str);
9563 ADJUST_INDICES(start, end, len);
9564 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009565 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009567 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9568 kind, end-start, ch, direction);
9569 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009571 else
9572 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573}
9574
Alexander Belopolsky40018472011-02-26 01:02:56 +00009575static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009576tailmatch(PyObject *self,
9577 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009578 Py_ssize_t start,
9579 Py_ssize_t end,
9580 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 int kind_self;
9583 int kind_sub;
9584 void *data_self;
9585 void *data_sub;
9586 Py_ssize_t offset;
9587 Py_ssize_t i;
9588 Py_ssize_t end_sub;
9589
9590 if (PyUnicode_READY(self) == -1 ||
9591 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009592 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9595 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009596 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009597 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009598
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009599 if (PyUnicode_GET_LENGTH(substring) == 0)
9600 return 1;
9601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602 kind_self = PyUnicode_KIND(self);
9603 data_self = PyUnicode_DATA(self);
9604 kind_sub = PyUnicode_KIND(substring);
9605 data_sub = PyUnicode_DATA(substring);
9606 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9607
9608 if (direction > 0)
9609 offset = end;
9610 else
9611 offset = start;
9612
9613 if (PyUnicode_READ(kind_self, data_self, offset) ==
9614 PyUnicode_READ(kind_sub, data_sub, 0) &&
9615 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9616 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9617 /* If both are of the same kind, memcmp is sufficient */
9618 if (kind_self == kind_sub) {
9619 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009620 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621 data_sub,
9622 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009623 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009625 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 else {
9627 /* We do not need to compare 0 and len(substring)-1 because
9628 the if statement above ensured already that they are equal
9629 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 for (i = 1; i < end_sub; ++i) {
9631 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9632 PyUnicode_READ(kind_sub, data_sub, i))
9633 return 0;
9634 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009635 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637 }
9638
9639 return 0;
9640}
9641
Alexander Belopolsky40018472011-02-26 01:02:56 +00009642Py_ssize_t
9643PyUnicode_Tailmatch(PyObject *str,
9644 PyObject *substr,
9645 Py_ssize_t start,
9646 Py_ssize_t end,
9647 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009649 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009650 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009651
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009652 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653}
9654
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009655static PyObject *
9656ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9659 char *resdata, *data = PyUnicode_DATA(self);
9660 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009661
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 res = PyUnicode_New(len, 127);
9663 if (res == NULL)
9664 return NULL;
9665 resdata = PyUnicode_DATA(res);
9666 if (lower)
9667 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009669 _Py_bytes_upper(resdata, data, len);
9670 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671}
9672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009674handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676 Py_ssize_t j;
9677 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009678 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009680
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009681 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9682
9683 where ! is a negation and \p{xxx} is a character with property xxx.
9684 */
9685 for (j = i - 1; j >= 0; j--) {
9686 c = PyUnicode_READ(kind, data, j);
9687 if (!_PyUnicode_IsCaseIgnorable(c))
9688 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9691 if (final_sigma) {
9692 for (j = i + 1; j < length; j++) {
9693 c = PyUnicode_READ(kind, data, j);
9694 if (!_PyUnicode_IsCaseIgnorable(c))
9695 break;
9696 }
9697 final_sigma = j == length || !_PyUnicode_IsCased(c);
9698 }
9699 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700}
9701
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009702static int
9703lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9704 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706 /* Obscure special case. */
9707 if (c == 0x3A3) {
9708 mapped[0] = handle_capital_sigma(kind, data, length, i);
9709 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712}
9713
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714static Py_ssize_t
9715do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009717 Py_ssize_t i, k = 0;
9718 int n_res, j;
9719 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009720
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009721 c = PyUnicode_READ(kind, data, 0);
9722 n_res = _PyUnicode_ToUpperFull(c, mapped);
9723 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009724 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009725 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009727 for (i = 1; i < length; i++) {
9728 c = PyUnicode_READ(kind, data, i);
9729 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9730 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009731 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009732 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009733 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009734 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009735 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736}
9737
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009738static Py_ssize_t
9739do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9740 Py_ssize_t i, k = 0;
9741
9742 for (i = 0; i < length; i++) {
9743 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9744 int n_res, j;
9745 if (Py_UNICODE_ISUPPER(c)) {
9746 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9747 }
9748 else if (Py_UNICODE_ISLOWER(c)) {
9749 n_res = _PyUnicode_ToUpperFull(c, mapped);
9750 }
9751 else {
9752 n_res = 1;
9753 mapped[0] = c;
9754 }
9755 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009756 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009757 res[k++] = mapped[j];
9758 }
9759 }
9760 return k;
9761}
9762
9763static Py_ssize_t
9764do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9765 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009767 Py_ssize_t i, k = 0;
9768
9769 for (i = 0; i < length; i++) {
9770 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9771 int n_res, j;
9772 if (lower)
9773 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9774 else
9775 n_res = _PyUnicode_ToUpperFull(c, mapped);
9776 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009777 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009778 res[k++] = mapped[j];
9779 }
9780 }
9781 return k;
9782}
9783
9784static Py_ssize_t
9785do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9786{
9787 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9788}
9789
9790static Py_ssize_t
9791do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9792{
9793 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9794}
9795
Benjamin Petersone51757f2012-01-12 21:10:29 -05009796static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009797do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9798{
9799 Py_ssize_t i, k = 0;
9800
9801 for (i = 0; i < length; i++) {
9802 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9803 Py_UCS4 mapped[3];
9804 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9805 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009806 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009807 res[k++] = mapped[j];
9808 }
9809 }
9810 return k;
9811}
9812
9813static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009814do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9815{
9816 Py_ssize_t i, k = 0;
9817 int previous_is_cased;
9818
9819 previous_is_cased = 0;
9820 for (i = 0; i < length; i++) {
9821 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9822 Py_UCS4 mapped[3];
9823 int n_res, j;
9824
9825 if (previous_is_cased)
9826 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9827 else
9828 n_res = _PyUnicode_ToTitleFull(c, mapped);
9829
9830 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009831 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009832 res[k++] = mapped[j];
9833 }
9834
9835 previous_is_cased = _PyUnicode_IsCased(c);
9836 }
9837 return k;
9838}
9839
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009840static PyObject *
9841case_operation(PyObject *self,
9842 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9843{
9844 PyObject *res = NULL;
9845 Py_ssize_t length, newlength = 0;
9846 int kind, outkind;
9847 void *data, *outdata;
9848 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9849
Benjamin Petersoneea48462012-01-16 14:28:50 -05009850 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009851
9852 kind = PyUnicode_KIND(self);
9853 data = PyUnicode_DATA(self);
9854 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009855 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009856 PyErr_SetString(PyExc_OverflowError, "string is too long");
9857 return NULL;
9858 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009859 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009860 if (tmp == NULL)
9861 return PyErr_NoMemory();
9862 newlength = perform(kind, data, length, tmp, &maxchar);
9863 res = PyUnicode_New(newlength, maxchar);
9864 if (res == NULL)
9865 goto leave;
9866 tmpend = tmp + newlength;
9867 outdata = PyUnicode_DATA(res);
9868 outkind = PyUnicode_KIND(res);
9869 switch (outkind) {
9870 case PyUnicode_1BYTE_KIND:
9871 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9872 break;
9873 case PyUnicode_2BYTE_KIND:
9874 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9875 break;
9876 case PyUnicode_4BYTE_KIND:
9877 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9878 break;
9879 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009880 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009881 }
9882 leave:
9883 PyMem_FREE(tmp);
9884 return res;
9885}
9886
Tim Peters8ce9f162004-08-27 01:49:32 +00009887PyObject *
9888PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009890 PyObject *res;
9891 PyObject *fseq;
9892 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009893 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009895 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009896 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009897 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009898 }
9899
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009900 /* NOTE: the following code can't call back into Python code,
9901 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009902 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009903
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009904 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009905 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009906 res = _PyUnicode_JoinArray(separator, items, seqlen);
9907 Py_DECREF(fseq);
9908 return res;
9909}
9910
9911PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009912_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009913{
9914 PyObject *res = NULL; /* the result */
9915 PyObject *sep = NULL;
9916 Py_ssize_t seplen;
9917 PyObject *item;
9918 Py_ssize_t sz, i, res_offset;
9919 Py_UCS4 maxchar;
9920 Py_UCS4 item_maxchar;
9921 int use_memcpy;
9922 unsigned char *res_data = NULL, *sep_data = NULL;
9923 PyObject *last_obj;
9924 unsigned int kind = 0;
9925
Tim Peters05eba1f2004-08-27 21:32:02 +00009926 /* If empty sequence, return u"". */
9927 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009928 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009929 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009930
Tim Peters05eba1f2004-08-27 21:32:02 +00009931 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009932 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009933 if (seqlen == 1) {
9934 if (PyUnicode_CheckExact(items[0])) {
9935 res = items[0];
9936 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009937 return res;
9938 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009939 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009940 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009941 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009942 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009943 /* Set up sep and seplen */
9944 if (separator == NULL) {
9945 /* fall back to a blank space separator */
9946 sep = PyUnicode_FromOrdinal(' ');
9947 if (!sep)
9948 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009949 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009950 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009951 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009952 else {
9953 if (!PyUnicode_Check(separator)) {
9954 PyErr_Format(PyExc_TypeError,
9955 "separator: expected str instance,"
9956 " %.80s found",
9957 Py_TYPE(separator)->tp_name);
9958 goto onError;
9959 }
9960 if (PyUnicode_READY(separator))
9961 goto onError;
9962 sep = separator;
9963 seplen = PyUnicode_GET_LENGTH(separator);
9964 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9965 /* inc refcount to keep this code path symmetric with the
9966 above case of a blank separator */
9967 Py_INCREF(sep);
9968 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009969 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009970 }
9971
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009972 /* There are at least two things to join, or else we have a subclass
9973 * of str in the sequence.
9974 * Do a pre-pass to figure out the total amount of space we'll
9975 * need (sz), and see whether all argument are strings.
9976 */
9977 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009978#ifdef Py_DEBUG
9979 use_memcpy = 0;
9980#else
9981 use_memcpy = 1;
9982#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009983 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009984 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009985 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009986 if (!PyUnicode_Check(item)) {
9987 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009988 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009989 " %.80s found",
9990 i, Py_TYPE(item)->tp_name);
9991 goto onError;
9992 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 if (PyUnicode_READY(item) == -1)
9994 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009995 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009997 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009998 if (i != 0) {
9999 add_sz += seplen;
10000 }
10001 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010002 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010004 goto onError;
10005 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010006 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010007 if (use_memcpy && last_obj != NULL) {
10008 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10009 use_memcpy = 0;
10010 }
10011 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010012 }
Tim Petersced69f82003-09-16 20:30:58 +000010013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010015 if (res == NULL)
10016 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010017
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010018 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010019#ifdef Py_DEBUG
10020 use_memcpy = 0;
10021#else
10022 if (use_memcpy) {
10023 res_data = PyUnicode_1BYTE_DATA(res);
10024 kind = PyUnicode_KIND(res);
10025 if (seplen != 0)
10026 sep_data = PyUnicode_1BYTE_DATA(sep);
10027 }
10028#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010029 if (use_memcpy) {
10030 for (i = 0; i < seqlen; ++i) {
10031 Py_ssize_t itemlen;
10032 item = items[i];
10033
10034 /* Copy item, and maybe the separator. */
10035 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010036 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010037 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010038 kind * seplen);
10039 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010040 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010041
10042 itemlen = PyUnicode_GET_LENGTH(item);
10043 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010044 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010045 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010046 kind * itemlen);
10047 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010048 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010049 }
10050 assert(res_data == PyUnicode_1BYTE_DATA(res)
10051 + kind * PyUnicode_GET_LENGTH(res));
10052 }
10053 else {
10054 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10055 Py_ssize_t itemlen;
10056 item = items[i];
10057
10058 /* Copy item, and maybe the separator. */
10059 if (i && seplen != 0) {
10060 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10061 res_offset += seplen;
10062 }
10063
10064 itemlen = PyUnicode_GET_LENGTH(item);
10065 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010066 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010067 res_offset += itemlen;
10068 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010069 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010070 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010071 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010074 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010076
Benjamin Peterson29060642009-01-31 22:14:21 +000010077 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010079 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010080 return NULL;
10081}
10082
Victor Stinnerd3f08822012-05-29 12:57:52 +020010083void
10084_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10085 Py_UCS4 fill_char)
10086{
10087 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner7f9fb0f2018-11-27 12:42:04 +010010088 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010089 assert(PyUnicode_IS_READY(unicode));
10090 assert(unicode_modifiable(unicode));
10091 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10092 assert(start >= 0);
10093 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10094 FILL(kind, data, fill_char, start, length);
10095}
10096
Victor Stinner3fe55312012-01-04 00:33:50 +010010097Py_ssize_t
10098PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10099 Py_UCS4 fill_char)
10100{
10101 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010102
10103 if (!PyUnicode_Check(unicode)) {
10104 PyErr_BadInternalCall();
10105 return -1;
10106 }
10107 if (PyUnicode_READY(unicode) == -1)
10108 return -1;
10109 if (unicode_check_modifiable(unicode))
10110 return -1;
10111
Victor Stinnerd3f08822012-05-29 12:57:52 +020010112 if (start < 0) {
10113 PyErr_SetString(PyExc_IndexError, "string index out of range");
10114 return -1;
10115 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010116 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10117 PyErr_SetString(PyExc_ValueError,
10118 "fill character is bigger than "
10119 "the string maximum character");
10120 return -1;
10121 }
10122
10123 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10124 length = Py_MIN(maxlen, length);
10125 if (length <= 0)
10126 return 0;
10127
Victor Stinnerd3f08822012-05-29 12:57:52 +020010128 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010129 return length;
10130}
10131
Victor Stinner9310abb2011-10-05 00:59:23 +020010132static PyObject *
10133pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010134 Py_ssize_t left,
10135 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 PyObject *u;
10139 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010140 int kind;
10141 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142
10143 if (left < 0)
10144 left = 0;
10145 if (right < 0)
10146 right = 0;
10147
Victor Stinnerc4b49542011-12-11 22:44:26 +010010148 if (left == 0 && right == 0)
10149 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10152 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010153 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10154 return NULL;
10155 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010157 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010159 if (!u)
10160 return NULL;
10161
10162 kind = PyUnicode_KIND(u);
10163 data = PyUnicode_DATA(u);
10164 if (left)
10165 FILL(kind, data, fill, 0, left);
10166 if (right)
10167 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010168 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010169 assert(_PyUnicode_CheckConsistency(u, 1));
10170 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171}
10172
Alexander Belopolsky40018472011-02-26 01:02:56 +000010173PyObject *
10174PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010178 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010179 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180
Benjamin Petersonead6b532011-12-20 17:23:42 -060010181 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010183 if (PyUnicode_IS_ASCII(string))
10184 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010185 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010186 PyUnicode_GET_LENGTH(string), keepends);
10187 else
10188 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010189 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010190 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 break;
10192 case PyUnicode_2BYTE_KIND:
10193 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010194 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 PyUnicode_GET_LENGTH(string), keepends);
10196 break;
10197 case PyUnicode_4BYTE_KIND:
10198 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010199 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 PyUnicode_GET_LENGTH(string), keepends);
10201 break;
10202 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010203 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206}
10207
Alexander Belopolsky40018472011-02-26 01:02:56 +000010208static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010209split(PyObject *self,
10210 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010211 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010213 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 void *buf1, *buf2;
10215 Py_ssize_t len1, len2;
10216 PyObject* out;
10217
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010219 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 if (PyUnicode_READY(self) == -1)
10222 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010225 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010227 if (PyUnicode_IS_ASCII(self))
10228 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010229 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010230 PyUnicode_GET_LENGTH(self), maxcount
10231 );
10232 else
10233 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010234 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010235 PyUnicode_GET_LENGTH(self), maxcount
10236 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 case PyUnicode_2BYTE_KIND:
10238 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010239 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 PyUnicode_GET_LENGTH(self), maxcount
10241 );
10242 case PyUnicode_4BYTE_KIND:
10243 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010244 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 PyUnicode_GET_LENGTH(self), maxcount
10246 );
10247 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010248 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 }
10250
10251 if (PyUnicode_READY(substring) == -1)
10252 return NULL;
10253
10254 kind1 = PyUnicode_KIND(self);
10255 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 len1 = PyUnicode_GET_LENGTH(self);
10257 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010258 if (kind1 < kind2 || len1 < len2) {
10259 out = PyList_New(1);
10260 if (out == NULL)
10261 return NULL;
10262 Py_INCREF(self);
10263 PyList_SET_ITEM(out, 0, self);
10264 return out;
10265 }
10266 buf1 = PyUnicode_DATA(self);
10267 buf2 = PyUnicode_DATA(substring);
10268 if (kind2 != kind1) {
10269 buf2 = _PyUnicode_AsKind(substring, kind1);
10270 if (!buf2)
10271 return NULL;
10272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010274 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010276 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10277 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010278 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010279 else
10280 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010281 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 break;
10283 case PyUnicode_2BYTE_KIND:
10284 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010285 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 break;
10287 case PyUnicode_4BYTE_KIND:
10288 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010289 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 break;
10291 default:
10292 out = NULL;
10293 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010294 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 PyMem_Free(buf2);
10296 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297}
10298
Alexander Belopolsky40018472011-02-26 01:02:56 +000010299static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010300rsplit(PyObject *self,
10301 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010302 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010303{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010304 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 void *buf1, *buf2;
10306 Py_ssize_t len1, len2;
10307 PyObject* out;
10308
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010309 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010310 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 if (PyUnicode_READY(self) == -1)
10313 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010314
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010316 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010318 if (PyUnicode_IS_ASCII(self))
10319 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010320 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010321 PyUnicode_GET_LENGTH(self), maxcount
10322 );
10323 else
10324 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010325 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010326 PyUnicode_GET_LENGTH(self), maxcount
10327 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 case PyUnicode_2BYTE_KIND:
10329 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010330 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 PyUnicode_GET_LENGTH(self), maxcount
10332 );
10333 case PyUnicode_4BYTE_KIND:
10334 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010335 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 PyUnicode_GET_LENGTH(self), maxcount
10337 );
10338 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010339 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 }
10341
10342 if (PyUnicode_READY(substring) == -1)
10343 return NULL;
10344
10345 kind1 = PyUnicode_KIND(self);
10346 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 len1 = PyUnicode_GET_LENGTH(self);
10348 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010349 if (kind1 < kind2 || len1 < len2) {
10350 out = PyList_New(1);
10351 if (out == NULL)
10352 return NULL;
10353 Py_INCREF(self);
10354 PyList_SET_ITEM(out, 0, self);
10355 return out;
10356 }
10357 buf1 = PyUnicode_DATA(self);
10358 buf2 = PyUnicode_DATA(substring);
10359 if (kind2 != kind1) {
10360 buf2 = _PyUnicode_AsKind(substring, kind1);
10361 if (!buf2)
10362 return NULL;
10363 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010365 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010367 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10368 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010369 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010370 else
10371 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010372 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 break;
10374 case PyUnicode_2BYTE_KIND:
10375 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010376 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 break;
10378 case PyUnicode_4BYTE_KIND:
10379 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010380 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 break;
10382 default:
10383 out = NULL;
10384 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010385 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 PyMem_Free(buf2);
10387 return out;
10388}
10389
10390static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010391anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10392 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010394 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010396 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10397 return asciilib_find(buf1, len1, buf2, len2, offset);
10398 else
10399 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 case PyUnicode_2BYTE_KIND:
10401 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10402 case PyUnicode_4BYTE_KIND:
10403 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10404 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010405 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406}
10407
10408static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010409anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10410 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010412 switch (kind) {
10413 case PyUnicode_1BYTE_KIND:
10414 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10415 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10416 else
10417 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10418 case PyUnicode_2BYTE_KIND:
10419 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10420 case PyUnicode_4BYTE_KIND:
10421 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10422 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010423 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010424}
10425
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010426static void
10427replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10428 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10429{
10430 int kind = PyUnicode_KIND(u);
10431 void *data = PyUnicode_DATA(u);
10432 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10433 if (kind == PyUnicode_1BYTE_KIND) {
10434 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10435 (Py_UCS1 *)data + len,
10436 u1, u2, maxcount);
10437 }
10438 else if (kind == PyUnicode_2BYTE_KIND) {
10439 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10440 (Py_UCS2 *)data + len,
10441 u1, u2, maxcount);
10442 }
10443 else {
10444 assert(kind == PyUnicode_4BYTE_KIND);
10445 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10446 (Py_UCS4 *)data + len,
10447 u1, u2, maxcount);
10448 }
10449}
10450
Alexander Belopolsky40018472011-02-26 01:02:56 +000010451static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452replace(PyObject *self, PyObject *str1,
10453 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010454{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 PyObject *u;
10456 char *sbuf = PyUnicode_DATA(self);
10457 char *buf1 = PyUnicode_DATA(str1);
10458 char *buf2 = PyUnicode_DATA(str2);
10459 int srelease = 0, release1 = 0, release2 = 0;
10460 int skind = PyUnicode_KIND(self);
10461 int kind1 = PyUnicode_KIND(str1);
10462 int kind2 = PyUnicode_KIND(str2);
10463 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10464 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10465 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010466 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010467 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010468
10469 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010470 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010472 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473
Victor Stinner59de0ee2011-10-07 10:01:28 +020010474 if (str1 == str2)
10475 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476
Victor Stinner49a0a212011-10-12 23:46:10 +020010477 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010478 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10479 if (maxchar < maxchar_str1)
10480 /* substring too wide to be present */
10481 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010482 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10483 /* Replacing str1 with str2 may cause a maxchar reduction in the
10484 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010485 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010486 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010489 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010491 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010493 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010494 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010495 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010496
Victor Stinner69ed0f42013-04-09 21:48:24 +020010497 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010498 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010499 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010500 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010501 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010503 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010505
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010506 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10507 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010508 }
10509 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 int rkind = skind;
10511 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010512 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 if (kind1 < rkind) {
10515 /* widen substring */
10516 buf1 = _PyUnicode_AsKind(str1, rkind);
10517 if (!buf1) goto error;
10518 release1 = 1;
10519 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010520 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010521 if (i < 0)
10522 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 if (rkind > kind2) {
10524 /* widen replacement */
10525 buf2 = _PyUnicode_AsKind(str2, rkind);
10526 if (!buf2) goto error;
10527 release2 = 1;
10528 }
10529 else if (rkind < kind2) {
10530 /* widen self and buf1 */
10531 rkind = kind2;
10532 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010533 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 sbuf = _PyUnicode_AsKind(self, rkind);
10535 if (!sbuf) goto error;
10536 srelease = 1;
10537 buf1 = _PyUnicode_AsKind(str1, rkind);
10538 if (!buf1) goto error;
10539 release1 = 1;
10540 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010541 u = PyUnicode_New(slen, maxchar);
10542 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010544 assert(PyUnicode_KIND(u) == rkind);
10545 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010546
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010547 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010548 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010549 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010551 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010553
10554 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010555 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010556 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010557 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010558 if (i == -1)
10559 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010560 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010562 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010565 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010566 }
10567 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010569 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 int rkind = skind;
10571 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010574 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 buf1 = _PyUnicode_AsKind(str1, rkind);
10576 if (!buf1) goto error;
10577 release1 = 1;
10578 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010579 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010580 if (n == 0)
10581 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010583 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 buf2 = _PyUnicode_AsKind(str2, rkind);
10585 if (!buf2) goto error;
10586 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010589 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 rkind = kind2;
10591 sbuf = _PyUnicode_AsKind(self, rkind);
10592 if (!sbuf) goto error;
10593 srelease = 1;
10594 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010595 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 buf1 = _PyUnicode_AsKind(str1, rkind);
10597 if (!buf1) goto error;
10598 release1 = 1;
10599 }
10600 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10601 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010602 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 PyErr_SetString(PyExc_OverflowError,
10604 "replace string is too long");
10605 goto error;
10606 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010607 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010608 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010609 _Py_INCREF_UNICODE_EMPTY();
10610 if (!unicode_empty)
10611 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010612 u = unicode_empty;
10613 goto done;
10614 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010615 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 PyErr_SetString(PyExc_OverflowError,
10617 "replace string is too long");
10618 goto error;
10619 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010620 u = PyUnicode_New(new_size, maxchar);
10621 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010623 assert(PyUnicode_KIND(u) == rkind);
10624 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 ires = i = 0;
10626 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010627 while (n-- > 0) {
10628 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010629 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010630 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010631 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010632 if (j == -1)
10633 break;
10634 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010635 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010636 memcpy(res + rkind * ires,
10637 sbuf + rkind * i,
10638 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010640 }
10641 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010643 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010645 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010647 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010651 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010652 memcpy(res + rkind * ires,
10653 sbuf + rkind * i,
10654 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010655 }
10656 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010657 /* interleave */
10658 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010659 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010661 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663 if (--n <= 0)
10664 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010665 memcpy(res + rkind * ires,
10666 sbuf + rkind * i,
10667 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 ires++;
10669 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010671 memcpy(res + rkind * ires,
10672 sbuf + rkind * i,
10673 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010674 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010675 }
10676
10677 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010678 unicode_adjust_maxchar(&u);
10679 if (u == NULL)
10680 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010682
10683 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 if (srelease)
10685 PyMem_FREE(sbuf);
10686 if (release1)
10687 PyMem_FREE(buf1);
10688 if (release2)
10689 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010690 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010692
Benjamin Peterson29060642009-01-31 22:14:21 +000010693 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010694 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 if (srelease)
10696 PyMem_FREE(sbuf);
10697 if (release1)
10698 PyMem_FREE(buf1);
10699 if (release2)
10700 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010701 return unicode_result_unchanged(self);
10702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 error:
10704 if (srelease && sbuf)
10705 PyMem_FREE(sbuf);
10706 if (release1 && buf1)
10707 PyMem_FREE(buf1);
10708 if (release2 && buf2)
10709 PyMem_FREE(buf2);
10710 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711}
10712
10713/* --- Unicode Object Methods --------------------------------------------- */
10714
INADA Naoki3ae20562017-01-16 20:41:20 +090010715/*[clinic input]
10716str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717
INADA Naoki3ae20562017-01-16 20:41:20 +090010718Return a version of the string where each word is titlecased.
10719
10720More specifically, words start with uppercased characters and all remaining
10721cased characters have lower case.
10722[clinic start generated code]*/
10723
10724static PyObject *
10725unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010726/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010728 if (PyUnicode_READY(self) == -1)
10729 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010730 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731}
10732
INADA Naoki3ae20562017-01-16 20:41:20 +090010733/*[clinic input]
10734str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735
INADA Naoki3ae20562017-01-16 20:41:20 +090010736Return a capitalized version of the string.
10737
10738More specifically, make the first character have upper case and the rest lower
10739case.
10740[clinic start generated code]*/
10741
10742static PyObject *
10743unicode_capitalize_impl(PyObject *self)
10744/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010746 if (PyUnicode_READY(self) == -1)
10747 return NULL;
10748 if (PyUnicode_GET_LENGTH(self) == 0)
10749 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010750 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751}
10752
INADA Naoki3ae20562017-01-16 20:41:20 +090010753/*[clinic input]
10754str.casefold as unicode_casefold
10755
10756Return a version of the string suitable for caseless comparisons.
10757[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010758
10759static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010760unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010761/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010762{
10763 if (PyUnicode_READY(self) == -1)
10764 return NULL;
10765 if (PyUnicode_IS_ASCII(self))
10766 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010767 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010768}
10769
10770
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010771/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010772
10773static int
10774convert_uc(PyObject *obj, void *addr)
10775{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010777
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010778 if (!PyUnicode_Check(obj)) {
10779 PyErr_Format(PyExc_TypeError,
10780 "The fill character must be a unicode character, "
10781 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010782 return 0;
10783 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010784 if (PyUnicode_READY(obj) < 0)
10785 return 0;
10786 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010787 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010789 return 0;
10790 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010791 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010792 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010793}
10794
INADA Naoki3ae20562017-01-16 20:41:20 +090010795/*[clinic input]
10796str.center as unicode_center
10797
10798 width: Py_ssize_t
10799 fillchar: Py_UCS4 = ' '
10800 /
10801
10802Return a centered string of length width.
10803
10804Padding is done using the specified fill character (default is a space).
10805[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806
10807static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010808unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10809/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010811 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812
Benjamin Petersonbac79492012-01-14 13:34:47 -050010813 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814 return NULL;
10815
Victor Stinnerc4b49542011-12-11 22:44:26 +010010816 if (PyUnicode_GET_LENGTH(self) >= width)
10817 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818
Victor Stinnerc4b49542011-12-11 22:44:26 +010010819 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820 left = marg / 2 + (marg & width & 1);
10821
Victor Stinner9310abb2011-10-05 00:59:23 +020010822 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823}
10824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825/* This function assumes that str1 and str2 are readied by the caller. */
10826
Marc-André Lemburge5034372000-08-08 08:04:29 +000010827static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010828unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010829{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010830#define COMPARE(TYPE1, TYPE2) \
10831 do { \
10832 TYPE1* p1 = (TYPE1 *)data1; \
10833 TYPE2* p2 = (TYPE2 *)data2; \
10834 TYPE1* end = p1 + len; \
10835 Py_UCS4 c1, c2; \
10836 for (; p1 != end; p1++, p2++) { \
10837 c1 = *p1; \
10838 c2 = *p2; \
10839 if (c1 != c2) \
10840 return (c1 < c2) ? -1 : 1; \
10841 } \
10842 } \
10843 while (0)
10844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845 int kind1, kind2;
10846 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010847 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 kind1 = PyUnicode_KIND(str1);
10850 kind2 = PyUnicode_KIND(str2);
10851 data1 = PyUnicode_DATA(str1);
10852 data2 = PyUnicode_DATA(str2);
10853 len1 = PyUnicode_GET_LENGTH(str1);
10854 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010855 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010856
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010857 switch(kind1) {
10858 case PyUnicode_1BYTE_KIND:
10859 {
10860 switch(kind2) {
10861 case PyUnicode_1BYTE_KIND:
10862 {
10863 int cmp = memcmp(data1, data2, len);
10864 /* normalize result of memcmp() into the range [-1; 1] */
10865 if (cmp < 0)
10866 return -1;
10867 if (cmp > 0)
10868 return 1;
10869 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010870 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010871 case PyUnicode_2BYTE_KIND:
10872 COMPARE(Py_UCS1, Py_UCS2);
10873 break;
10874 case PyUnicode_4BYTE_KIND:
10875 COMPARE(Py_UCS1, Py_UCS4);
10876 break;
10877 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010878 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010879 }
10880 break;
10881 }
10882 case PyUnicode_2BYTE_KIND:
10883 {
10884 switch(kind2) {
10885 case PyUnicode_1BYTE_KIND:
10886 COMPARE(Py_UCS2, Py_UCS1);
10887 break;
10888 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010889 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010890 COMPARE(Py_UCS2, Py_UCS2);
10891 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010892 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010893 case PyUnicode_4BYTE_KIND:
10894 COMPARE(Py_UCS2, Py_UCS4);
10895 break;
10896 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010897 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010898 }
10899 break;
10900 }
10901 case PyUnicode_4BYTE_KIND:
10902 {
10903 switch(kind2) {
10904 case PyUnicode_1BYTE_KIND:
10905 COMPARE(Py_UCS4, Py_UCS1);
10906 break;
10907 case PyUnicode_2BYTE_KIND:
10908 COMPARE(Py_UCS4, Py_UCS2);
10909 break;
10910 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010911 {
10912#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10913 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10914 /* normalize result of wmemcmp() into the range [-1; 1] */
10915 if (cmp < 0)
10916 return -1;
10917 if (cmp > 0)
10918 return 1;
10919#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010920 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010921#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010922 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010923 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010924 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010925 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010926 }
10927 break;
10928 }
10929 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010930 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010931 }
10932
Victor Stinner770e19e2012-10-04 22:59:45 +020010933 if (len1 == len2)
10934 return 0;
10935 if (len1 < len2)
10936 return -1;
10937 else
10938 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010939
10940#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010941}
10942
Benjamin Peterson621b4302016-09-09 13:54:34 -070010943static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010944unicode_compare_eq(PyObject *str1, PyObject *str2)
10945{
10946 int kind;
10947 void *data1, *data2;
10948 Py_ssize_t len;
10949 int cmp;
10950
Victor Stinnere5567ad2012-10-23 02:48:49 +020010951 len = PyUnicode_GET_LENGTH(str1);
10952 if (PyUnicode_GET_LENGTH(str2) != len)
10953 return 0;
10954 kind = PyUnicode_KIND(str1);
10955 if (PyUnicode_KIND(str2) != kind)
10956 return 0;
10957 data1 = PyUnicode_DATA(str1);
10958 data2 = PyUnicode_DATA(str2);
10959
10960 cmp = memcmp(data1, data2, len * kind);
10961 return (cmp == 0);
10962}
10963
10964
Alexander Belopolsky40018472011-02-26 01:02:56 +000010965int
10966PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10969 if (PyUnicode_READY(left) == -1 ||
10970 PyUnicode_READY(right) == -1)
10971 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010972
10973 /* a string is equal to itself */
10974 if (left == right)
10975 return 0;
10976
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010977 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010979 PyErr_Format(PyExc_TypeError,
10980 "Can't compare %.100s and %.100s",
10981 left->ob_type->tp_name,
10982 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983 return -1;
10984}
10985
Martin v. Löwis5b222132007-06-10 09:51:05 +000010986int
10987PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10988{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 Py_ssize_t i;
10990 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010992 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993
Victor Stinner910337b2011-10-03 03:20:16 +020010994 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010995 if (!PyUnicode_IS_READY(uni)) {
10996 const wchar_t *ws = _PyUnicode_WSTR(uni);
10997 /* Compare Unicode string and source character set string */
10998 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10999 if (chr != ustr[i])
11000 return (chr < ustr[i]) ? -1 : 1;
11001 }
11002 /* This check keeps Python strings that end in '\0' from comparing equal
11003 to C strings identical up to that point. */
11004 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11005 return 1; /* uni is longer */
11006 if (ustr[i])
11007 return -1; /* str is longer */
11008 return 0;
11009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011010 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011011 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011012 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011013 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011014 size_t len, len2 = strlen(str);
11015 int cmp;
11016
11017 len = Py_MIN(len1, len2);
11018 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011019 if (cmp != 0) {
11020 if (cmp < 0)
11021 return -1;
11022 else
11023 return 1;
11024 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011025 if (len1 > len2)
11026 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011027 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011028 return -1; /* str is longer */
11029 return 0;
11030 }
11031 else {
11032 void *data = PyUnicode_DATA(uni);
11033 /* Compare Unicode string and source character set string */
11034 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011035 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011036 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11037 /* This check keeps Python strings that end in '\0' from comparing equal
11038 to C strings identical up to that point. */
11039 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11040 return 1; /* uni is longer */
11041 if (str[i])
11042 return -1; /* str is longer */
11043 return 0;
11044 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011045}
11046
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011047static int
11048non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11049{
11050 size_t i, len;
11051 const wchar_t *p;
11052 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11053 if (strlen(str) != len)
11054 return 0;
11055 p = _PyUnicode_WSTR(unicode);
11056 assert(p);
11057 for (i = 0; i < len; i++) {
11058 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011059 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011060 return 0;
11061 }
11062 return 1;
11063}
11064
11065int
11066_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11067{
11068 size_t len;
11069 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011070 assert(str);
11071#ifndef NDEBUG
11072 for (const char *p = str; *p; p++) {
11073 assert((unsigned char)*p < 128);
11074 }
11075#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011076 if (PyUnicode_READY(unicode) == -1) {
11077 /* Memory error or bad data */
11078 PyErr_Clear();
11079 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11080 }
11081 if (!PyUnicode_IS_ASCII(unicode))
11082 return 0;
11083 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11084 return strlen(str) == len &&
11085 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11086}
11087
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011088int
11089_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11090{
11091 PyObject *right_uni;
11092 Py_hash_t hash;
11093
11094 assert(_PyUnicode_CHECK(left));
11095 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011096#ifndef NDEBUG
11097 for (const char *p = right->string; *p; p++) {
11098 assert((unsigned char)*p < 128);
11099 }
11100#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011101
11102 if (PyUnicode_READY(left) == -1) {
11103 /* memory error or bad data */
11104 PyErr_Clear();
11105 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11106 }
11107
11108 if (!PyUnicode_IS_ASCII(left))
11109 return 0;
11110
11111 right_uni = _PyUnicode_FromId(right); /* borrowed */
11112 if (right_uni == NULL) {
11113 /* memory error or bad data */
11114 PyErr_Clear();
11115 return _PyUnicode_EqualToASCIIString(left, right->string);
11116 }
11117
11118 if (left == right_uni)
11119 return 1;
11120
11121 if (PyUnicode_CHECK_INTERNED(left))
11122 return 0;
11123
INADA Naoki7cc95f52018-01-28 02:07:09 +090011124 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011125 hash = _PyUnicode_HASH(left);
11126 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11127 return 0;
11128
11129 return unicode_compare_eq(left, right_uni);
11130}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011131
Alexander Belopolsky40018472011-02-26 01:02:56 +000011132PyObject *
11133PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011134{
11135 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011136
Victor Stinnere5567ad2012-10-23 02:48:49 +020011137 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11138 Py_RETURN_NOTIMPLEMENTED;
11139
11140 if (PyUnicode_READY(left) == -1 ||
11141 PyUnicode_READY(right) == -1)
11142 return NULL;
11143
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011144 if (left == right) {
11145 switch (op) {
11146 case Py_EQ:
11147 case Py_LE:
11148 case Py_GE:
11149 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011150 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011151 case Py_NE:
11152 case Py_LT:
11153 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011154 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011155 default:
11156 PyErr_BadArgument();
11157 return NULL;
11158 }
11159 }
11160 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011161 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011162 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011163 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011164 }
11165 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011166 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011167 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011168 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011169}
11170
Alexander Belopolsky40018472011-02-26 01:02:56 +000011171int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011172_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11173{
11174 return unicode_eq(aa, bb);
11175}
11176
11177int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011178PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011179{
Victor Stinner77282cb2013-04-14 19:22:47 +020011180 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 void *buf1, *buf2;
11182 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011183 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011184
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011185 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011186 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011187 "'in <string>' requires string as left operand, not %.100s",
11188 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011189 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011190 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011191 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011192 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011193 if (ensure_unicode(str) < 0)
11194 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011197 kind2 = PyUnicode_KIND(substr);
11198 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011199 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011201 len2 = PyUnicode_GET_LENGTH(substr);
11202 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011203 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011204 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011205 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011206 if (len2 == 1) {
11207 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11208 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011209 return result;
11210 }
11211 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011212 buf2 = _PyUnicode_AsKind(substr, kind1);
11213 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011214 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216
Victor Stinner77282cb2013-04-14 19:22:47 +020011217 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 case PyUnicode_1BYTE_KIND:
11219 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11220 break;
11221 case PyUnicode_2BYTE_KIND:
11222 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11223 break;
11224 case PyUnicode_4BYTE_KIND:
11225 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11226 break;
11227 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011228 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011229 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011230
Victor Stinner77282cb2013-04-14 19:22:47 +020011231 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 PyMem_Free(buf2);
11233
Guido van Rossum403d68b2000-03-13 15:55:09 +000011234 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011235}
11236
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237/* Concat to string or Unicode object giving a new Unicode object. */
11238
Alexander Belopolsky40018472011-02-26 01:02:56 +000011239PyObject *
11240PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011242 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011243 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011244 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011246 if (ensure_unicode(left) < 0)
11247 return NULL;
11248
11249 if (!PyUnicode_Check(right)) {
11250 PyErr_Format(PyExc_TypeError,
11251 "can only concatenate str (not \"%.200s\") to str",
11252 right->ob_type->tp_name);
11253 return NULL;
11254 }
11255 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257
11258 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011259 if (left == unicode_empty)
11260 return PyUnicode_FromObject(right);
11261 if (right == unicode_empty)
11262 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011264 left_len = PyUnicode_GET_LENGTH(left);
11265 right_len = PyUnicode_GET_LENGTH(right);
11266 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011267 PyErr_SetString(PyExc_OverflowError,
11268 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011269 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011270 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011271 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011272
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011273 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11274 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011275 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011276
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011278 result = PyUnicode_New(new_len, maxchar);
11279 if (result == NULL)
11280 return NULL;
11281 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11282 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11283 assert(_PyUnicode_CheckConsistency(result, 1));
11284 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285}
11286
Walter Dörwald1ab83302007-05-18 17:15:44 +000011287void
Victor Stinner23e56682011-10-03 03:54:37 +020011288PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011289{
Victor Stinner23e56682011-10-03 03:54:37 +020011290 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011291 Py_UCS4 maxchar, maxchar2;
11292 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011293
11294 if (p_left == NULL) {
11295 if (!PyErr_Occurred())
11296 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011297 return;
11298 }
Victor Stinner23e56682011-10-03 03:54:37 +020011299 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011300 if (right == NULL || left == NULL
11301 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011302 if (!PyErr_Occurred())
11303 PyErr_BadInternalCall();
11304 goto error;
11305 }
11306
Benjamin Petersonbac79492012-01-14 13:34:47 -050011307 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011308 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011309 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011310 goto error;
11311
Victor Stinner488fa492011-12-12 00:01:39 +010011312 /* Shortcuts */
11313 if (left == unicode_empty) {
11314 Py_DECREF(left);
11315 Py_INCREF(right);
11316 *p_left = right;
11317 return;
11318 }
11319 if (right == unicode_empty)
11320 return;
11321
11322 left_len = PyUnicode_GET_LENGTH(left);
11323 right_len = PyUnicode_GET_LENGTH(right);
11324 if (left_len > PY_SSIZE_T_MAX - right_len) {
11325 PyErr_SetString(PyExc_OverflowError,
11326 "strings are too large to concat");
11327 goto error;
11328 }
11329 new_len = left_len + right_len;
11330
11331 if (unicode_modifiable(left)
11332 && PyUnicode_CheckExact(right)
11333 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011334 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11335 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011336 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011337 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011338 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11339 {
11340 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011341 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011342 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011343
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011344 /* copy 'right' into the newly allocated area of 'left' */
11345 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011346 }
Victor Stinner488fa492011-12-12 00:01:39 +010011347 else {
11348 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11349 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011350 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011351
Victor Stinner488fa492011-12-12 00:01:39 +010011352 /* Concat the two Unicode strings */
11353 res = PyUnicode_New(new_len, maxchar);
11354 if (res == NULL)
11355 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011356 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11357 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011358 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011359 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011360 }
11361 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011362 return;
11363
11364error:
Victor Stinner488fa492011-12-12 00:01:39 +010011365 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011366}
11367
11368void
11369PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11370{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011371 PyUnicode_Append(pleft, right);
11372 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011373}
11374
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011375/*
11376Wraps stringlib_parse_args_finds() and additionally ensures that the
11377first argument is a unicode object.
11378*/
11379
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011380static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011381parse_args_finds_unicode(const char * function_name, PyObject *args,
11382 PyObject **substring,
11383 Py_ssize_t *start, Py_ssize_t *end)
11384{
11385 if(stringlib_parse_args_finds(function_name, args, substring,
11386 start, end)) {
11387 if (ensure_unicode(*substring) < 0)
11388 return 0;
11389 return 1;
11390 }
11391 return 0;
11392}
11393
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011394PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011395 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011397Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011398string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011399interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400
11401static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011402unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011404 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011405 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011406 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011408 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 void *buf1, *buf2;
11410 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011412 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011413 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 kind1 = PyUnicode_KIND(self);
11416 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011417 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011418 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 len1 = PyUnicode_GET_LENGTH(self);
11421 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011423 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011424 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011425
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011426 buf1 = PyUnicode_DATA(self);
11427 buf2 = PyUnicode_DATA(substring);
11428 if (kind2 != kind1) {
11429 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011430 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011431 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011432 }
11433 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 case PyUnicode_1BYTE_KIND:
11435 iresult = ucs1lib_count(
11436 ((Py_UCS1*)buf1) + start, end - start,
11437 buf2, len2, PY_SSIZE_T_MAX
11438 );
11439 break;
11440 case PyUnicode_2BYTE_KIND:
11441 iresult = ucs2lib_count(
11442 ((Py_UCS2*)buf1) + start, end - start,
11443 buf2, len2, PY_SSIZE_T_MAX
11444 );
11445 break;
11446 case PyUnicode_4BYTE_KIND:
11447 iresult = ucs4lib_count(
11448 ((Py_UCS4*)buf1) + start, end - start,
11449 buf2, len2, PY_SSIZE_T_MAX
11450 );
11451 break;
11452 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011453 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 }
11455
11456 result = PyLong_FromSsize_t(iresult);
11457
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011458 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461 return result;
11462}
11463
INADA Naoki3ae20562017-01-16 20:41:20 +090011464/*[clinic input]
11465str.encode as unicode_encode
11466
11467 encoding: str(c_default="NULL") = 'utf-8'
11468 The encoding in which to encode the string.
11469 errors: str(c_default="NULL") = 'strict'
11470 The error handling scheme to use for encoding errors.
11471 The default is 'strict' meaning that encoding errors raise a
11472 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11473 'xmlcharrefreplace' as well as any other name registered with
11474 codecs.register_error that can handle UnicodeEncodeErrors.
11475
11476Encode the string using the codec registered for encoding.
11477[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478
11479static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011480unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011481/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011483 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011484}
11485
INADA Naoki3ae20562017-01-16 20:41:20 +090011486/*[clinic input]
11487str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488
INADA Naoki3ae20562017-01-16 20:41:20 +090011489 tabsize: int = 8
11490
11491Return a copy where all tab characters are expanded using spaces.
11492
11493If tabsize is not given, a tab size of 8 characters is assumed.
11494[clinic start generated code]*/
11495
11496static PyObject *
11497unicode_expandtabs_impl(PyObject *self, int tabsize)
11498/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011500 Py_ssize_t i, j, line_pos, src_len, incr;
11501 Py_UCS4 ch;
11502 PyObject *u;
11503 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011504 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011505 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
Antoine Pitrou22425222011-10-04 19:10:51 +020011507 if (PyUnicode_READY(self) == -1)
11508 return NULL;
11509
Thomas Wouters7e474022000-07-16 12:04:32 +000011510 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011511 src_len = PyUnicode_GET_LENGTH(self);
11512 i = j = line_pos = 0;
11513 kind = PyUnicode_KIND(self);
11514 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011515 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011516 for (; i < src_len; i++) {
11517 ch = PyUnicode_READ(kind, src_data, i);
11518 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011519 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011520 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011521 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011523 goto overflow;
11524 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011525 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011526 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011529 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011530 goto overflow;
11531 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011533 if (ch == '\n' || ch == '\r')
11534 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011536 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011537 if (!found)
11538 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011539
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011541 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542 if (!u)
11543 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011544 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545
Antoine Pitroue71d5742011-10-04 15:55:09 +020011546 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547
Antoine Pitroue71d5742011-10-04 15:55:09 +020011548 for (; i < src_len; i++) {
11549 ch = PyUnicode_READ(kind, src_data, i);
11550 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011552 incr = tabsize - (line_pos % tabsize);
11553 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011554 FILL(kind, dest_data, ' ', j, incr);
11555 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011557 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011559 line_pos++;
11560 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011561 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011562 if (ch == '\n' || ch == '\r')
11563 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011565 }
11566 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011567 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011568
Antoine Pitroue71d5742011-10-04 15:55:09 +020011569 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011570 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11571 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572}
11573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011574PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011575 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576\n\
11577Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011578such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579arguments start and end are interpreted as in slice notation.\n\
11580\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011581Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
11583static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011586 /* initialize variables to prevent gcc warning */
11587 PyObject *substring = NULL;
11588 Py_ssize_t start = 0;
11589 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011590 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011592 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011595 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011597
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011598 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 if (result == -2)
11601 return NULL;
11602
Christian Heimes217cfd12007-12-02 14:31:20 +000011603 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604}
11605
11606static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011607unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011609 void *data;
11610 enum PyUnicode_Kind kind;
11611 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011612
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011613 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011614 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011616 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011617 if (PyUnicode_READY(self) == -1) {
11618 return NULL;
11619 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011620 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11621 PyErr_SetString(PyExc_IndexError, "string index out of range");
11622 return NULL;
11623 }
11624 kind = PyUnicode_KIND(self);
11625 data = PyUnicode_DATA(self);
11626 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011627 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628}
11629
Guido van Rossumc2504932007-09-18 19:42:40 +000011630/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011631 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011632static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011633unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634{
Guido van Rossumc2504932007-09-18 19:42:40 +000011635 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011636 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011637
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011638#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011639 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011640#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 if (_PyUnicode_HASH(self) != -1)
11642 return _PyUnicode_HASH(self);
11643 if (PyUnicode_READY(self) == -1)
11644 return -1;
11645 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011646 /*
11647 We make the hash of the empty string be 0, rather than using
11648 (prefix ^ suffix), since this slightly obfuscates the hash secret
11649 */
11650 if (len == 0) {
11651 _PyUnicode_HASH(self) = 0;
11652 return 0;
11653 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011654 x = _Py_HashBytes(PyUnicode_DATA(self),
11655 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011657 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658}
11659
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011660PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011661 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011663Return the lowest index in S where substring sub is found, \n\
11664such that sub is contained within S[start:end]. Optional\n\
11665arguments start and end are interpreted as in slice notation.\n\
11666\n\
11667Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668
11669static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011672 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011673 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011674 PyObject *substring = NULL;
11675 Py_ssize_t start = 0;
11676 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011678 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011681 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011684 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 if (result == -2)
11687 return NULL;
11688
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689 if (result < 0) {
11690 PyErr_SetString(PyExc_ValueError, "substring not found");
11691 return NULL;
11692 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011693
Christian Heimes217cfd12007-12-02 14:31:20 +000011694 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695}
11696
INADA Naoki3ae20562017-01-16 20:41:20 +090011697/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011698str.isascii as unicode_isascii
11699
11700Return True if all characters in the string are ASCII, False otherwise.
11701
11702ASCII characters have code points in the range U+0000-U+007F.
11703Empty string is ASCII too.
11704[clinic start generated code]*/
11705
11706static PyObject *
11707unicode_isascii_impl(PyObject *self)
11708/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11709{
11710 if (PyUnicode_READY(self) == -1) {
11711 return NULL;
11712 }
11713 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11714}
11715
11716/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011717str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718
INADA Naoki3ae20562017-01-16 20:41:20 +090011719Return True if the string is a lowercase string, False otherwise.
11720
11721A string is lowercase if all cased characters in the string are lowercase and
11722there is at least one cased character in the string.
11723[clinic start generated code]*/
11724
11725static PyObject *
11726unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011727/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011729 Py_ssize_t i, length;
11730 int kind;
11731 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732 int cased;
11733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 if (PyUnicode_READY(self) == -1)
11735 return NULL;
11736 length = PyUnicode_GET_LENGTH(self);
11737 kind = PyUnicode_KIND(self);
11738 data = PyUnicode_DATA(self);
11739
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 if (length == 1)
11742 return PyBool_FromLong(
11743 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011745 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011747 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011748
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 for (i = 0; i < length; i++) {
11751 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011752
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011754 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 else if (!cased && Py_UNICODE_ISLOWER(ch))
11756 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011758 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759}
11760
INADA Naoki3ae20562017-01-16 20:41:20 +090011761/*[clinic input]
11762str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
INADA Naoki3ae20562017-01-16 20:41:20 +090011764Return True if the string is an uppercase string, False otherwise.
11765
11766A string is uppercase if all cased characters in the string are uppercase and
11767there is at least one cased character in the string.
11768[clinic start generated code]*/
11769
11770static PyObject *
11771unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011772/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 Py_ssize_t i, length;
11775 int kind;
11776 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777 int cased;
11778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 if (PyUnicode_READY(self) == -1)
11780 return NULL;
11781 length = PyUnicode_GET_LENGTH(self);
11782 kind = PyUnicode_KIND(self);
11783 data = PyUnicode_DATA(self);
11784
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 if (length == 1)
11787 return PyBool_FromLong(
11788 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011790 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011792 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011793
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 for (i = 0; i < length; i++) {
11796 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011797
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011799 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011800 else if (!cased && Py_UNICODE_ISUPPER(ch))
11801 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011803 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804}
11805
INADA Naoki3ae20562017-01-16 20:41:20 +090011806/*[clinic input]
11807str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
INADA Naoki3ae20562017-01-16 20:41:20 +090011809Return True if the string is a title-cased string, False otherwise.
11810
11811In a title-cased string, upper- and title-case characters may only
11812follow uncased characters and lowercase characters only cased ones.
11813[clinic start generated code]*/
11814
11815static PyObject *
11816unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011817/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 Py_ssize_t i, length;
11820 int kind;
11821 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822 int cased, previous_is_cased;
11823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 if (PyUnicode_READY(self) == -1)
11825 return NULL;
11826 length = PyUnicode_GET_LENGTH(self);
11827 kind = PyUnicode_KIND(self);
11828 data = PyUnicode_DATA(self);
11829
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 if (length == 1) {
11832 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11833 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11834 (Py_UNICODE_ISUPPER(ch) != 0));
11835 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011837 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011839 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011840
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841 cased = 0;
11842 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 for (i = 0; i < length; i++) {
11844 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011845
Benjamin Peterson29060642009-01-31 22:14:21 +000011846 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11847 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011848 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 previous_is_cased = 1;
11850 cased = 1;
11851 }
11852 else if (Py_UNICODE_ISLOWER(ch)) {
11853 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011854 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011855 previous_is_cased = 1;
11856 cased = 1;
11857 }
11858 else
11859 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011861 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862}
11863
INADA Naoki3ae20562017-01-16 20:41:20 +090011864/*[clinic input]
11865str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866
INADA Naoki3ae20562017-01-16 20:41:20 +090011867Return True if the string is a whitespace string, False otherwise.
11868
11869A string is whitespace if all characters in the string are whitespace and there
11870is at least one character in the string.
11871[clinic start generated code]*/
11872
11873static PyObject *
11874unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011875/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 Py_ssize_t i, length;
11878 int kind;
11879 void *data;
11880
11881 if (PyUnicode_READY(self) == -1)
11882 return NULL;
11883 length = PyUnicode_GET_LENGTH(self);
11884 kind = PyUnicode_KIND(self);
11885 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 if (length == 1)
11889 return PyBool_FromLong(
11890 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011892 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011894 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 for (i = 0; i < length; i++) {
11897 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011898 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011899 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011901 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902}
11903
INADA Naoki3ae20562017-01-16 20:41:20 +090011904/*[clinic input]
11905str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011906
INADA Naoki3ae20562017-01-16 20:41:20 +090011907Return True if the string is an alphabetic string, False otherwise.
11908
11909A string is alphabetic if all characters in the string are alphabetic and there
11910is at least one character in the string.
11911[clinic start generated code]*/
11912
11913static PyObject *
11914unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011915/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011916{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 Py_ssize_t i, length;
11918 int kind;
11919 void *data;
11920
11921 if (PyUnicode_READY(self) == -1)
11922 return NULL;
11923 length = PyUnicode_GET_LENGTH(self);
11924 kind = PyUnicode_KIND(self);
11925 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011926
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011927 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 if (length == 1)
11929 return PyBool_FromLong(
11930 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011931
11932 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011934 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 for (i = 0; i < length; i++) {
11937 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011938 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011939 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011940 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011941}
11942
INADA Naoki3ae20562017-01-16 20:41:20 +090011943/*[clinic input]
11944str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011945
INADA Naoki3ae20562017-01-16 20:41:20 +090011946Return True if the string is an alpha-numeric string, False otherwise.
11947
11948A string is alpha-numeric if all characters in the string are alpha-numeric and
11949there is at least one character in the string.
11950[clinic start generated code]*/
11951
11952static PyObject *
11953unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011954/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011955{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 int kind;
11957 void *data;
11958 Py_ssize_t len, i;
11959
11960 if (PyUnicode_READY(self) == -1)
11961 return NULL;
11962
11963 kind = PyUnicode_KIND(self);
11964 data = PyUnicode_DATA(self);
11965 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011966
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011967 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 if (len == 1) {
11969 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11970 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11971 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011972
11973 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011975 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 for (i = 0; i < len; i++) {
11978 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011979 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011980 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011981 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011982 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011983}
11984
INADA Naoki3ae20562017-01-16 20:41:20 +090011985/*[clinic input]
11986str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987
INADA Naoki3ae20562017-01-16 20:41:20 +090011988Return True if the string is a decimal string, False otherwise.
11989
11990A string is a decimal string if all characters in the string are decimal and
11991there is at least one character in the string.
11992[clinic start generated code]*/
11993
11994static PyObject *
11995unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011996/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 Py_ssize_t i, length;
11999 int kind;
12000 void *data;
12001
12002 if (PyUnicode_READY(self) == -1)
12003 return NULL;
12004 length = PyUnicode_GET_LENGTH(self);
12005 kind = PyUnicode_KIND(self);
12006 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 if (length == 1)
12010 return PyBool_FromLong(
12011 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012013 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012015 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 for (i = 0; i < length; i++) {
12018 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012019 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012021 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022}
12023
INADA Naoki3ae20562017-01-16 20:41:20 +090012024/*[clinic input]
12025str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026
INADA Naoki3ae20562017-01-16 20:41:20 +090012027Return True if the string is a digit string, False otherwise.
12028
12029A string is a digit string if all characters in the string are digits and there
12030is at least one character in the string.
12031[clinic start generated code]*/
12032
12033static PyObject *
12034unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012035/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 Py_ssize_t i, length;
12038 int kind;
12039 void *data;
12040
12041 if (PyUnicode_READY(self) == -1)
12042 return NULL;
12043 length = PyUnicode_GET_LENGTH(self);
12044 kind = PyUnicode_KIND(self);
12045 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 if (length == 1) {
12049 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12050 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012053 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012055 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 for (i = 0; i < length; i++) {
12058 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012059 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012061 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062}
12063
INADA Naoki3ae20562017-01-16 20:41:20 +090012064/*[clinic input]
12065str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066
INADA Naoki3ae20562017-01-16 20:41:20 +090012067Return True if the string is a numeric string, False otherwise.
12068
12069A string is numeric if all characters in the string are numeric and there is at
12070least one character in the string.
12071[clinic start generated code]*/
12072
12073static PyObject *
12074unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012075/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 Py_ssize_t i, length;
12078 int kind;
12079 void *data;
12080
12081 if (PyUnicode_READY(self) == -1)
12082 return NULL;
12083 length = PyUnicode_GET_LENGTH(self);
12084 kind = PyUnicode_KIND(self);
12085 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 if (length == 1)
12089 return PyBool_FromLong(
12090 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012092 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012094 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 for (i = 0; i < length; i++) {
12097 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012098 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012100 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101}
12102
Martin v. Löwis47383402007-08-15 07:32:56 +000012103int
12104PyUnicode_IsIdentifier(PyObject *self)
12105{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 int kind;
12107 void *data;
12108 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012109 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 if (PyUnicode_READY(self) == -1) {
12112 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012113 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 }
12115
12116 /* Special case for empty strings */
12117 if (PyUnicode_GET_LENGTH(self) == 0)
12118 return 0;
12119 kind = PyUnicode_KIND(self);
12120 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012121
12122 /* PEP 3131 says that the first character must be in
12123 XID_Start and subsequent characters in XID_Continue,
12124 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012125 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012126 letters, digits, underscore). However, given the current
12127 definition of XID_Start and XID_Continue, it is sufficient
12128 to check just for these, except that _ must be allowed
12129 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012131 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012132 return 0;
12133
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012134 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012137 return 1;
12138}
12139
INADA Naoki3ae20562017-01-16 20:41:20 +090012140/*[clinic input]
12141str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012142
INADA Naoki3ae20562017-01-16 20:41:20 +090012143Return True if the string is a valid Python identifier, False otherwise.
12144
12145Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12146"class".
12147[clinic start generated code]*/
12148
12149static PyObject *
12150unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012151/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012152{
12153 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12154}
12155
INADA Naoki3ae20562017-01-16 20:41:20 +090012156/*[clinic input]
12157str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012158
INADA Naoki3ae20562017-01-16 20:41:20 +090012159Return True if the string is printable, False otherwise.
12160
12161A string is printable if all of its characters are considered printable in
12162repr() or if it is empty.
12163[clinic start generated code]*/
12164
12165static PyObject *
12166unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012167/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012168{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012169 Py_ssize_t i, length;
12170 int kind;
12171 void *data;
12172
12173 if (PyUnicode_READY(self) == -1)
12174 return NULL;
12175 length = PyUnicode_GET_LENGTH(self);
12176 kind = PyUnicode_KIND(self);
12177 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012178
12179 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180 if (length == 1)
12181 return PyBool_FromLong(
12182 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 for (i = 0; i < length; i++) {
12185 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012186 Py_RETURN_FALSE;
12187 }
12188 }
12189 Py_RETURN_TRUE;
12190}
12191
INADA Naoki3ae20562017-01-16 20:41:20 +090012192/*[clinic input]
12193str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194
INADA Naoki3ae20562017-01-16 20:41:20 +090012195 iterable: object
12196 /
12197
12198Concatenate any number of strings.
12199
Martin Panter91a88662017-01-24 00:30:06 +000012200The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012201The result is returned as a new string.
12202
12203Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12204[clinic start generated code]*/
12205
12206static PyObject *
12207unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012208/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209{
INADA Naoki3ae20562017-01-16 20:41:20 +090012210 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211}
12212
Martin v. Löwis18e16552006-02-15 17:27:45 +000012213static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012214unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 if (PyUnicode_READY(self) == -1)
12217 return -1;
12218 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219}
12220
INADA Naoki3ae20562017-01-16 20:41:20 +090012221/*[clinic input]
12222str.ljust as unicode_ljust
12223
12224 width: Py_ssize_t
12225 fillchar: Py_UCS4 = ' '
12226 /
12227
12228Return a left-justified string of length width.
12229
12230Padding is done using the specified fill character (default is a space).
12231[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232
12233static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012234unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12235/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012237 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239
Victor Stinnerc4b49542011-12-11 22:44:26 +010012240 if (PyUnicode_GET_LENGTH(self) >= width)
12241 return unicode_result_unchanged(self);
12242
12243 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244}
12245
INADA Naoki3ae20562017-01-16 20:41:20 +090012246/*[clinic input]
12247str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248
INADA Naoki3ae20562017-01-16 20:41:20 +090012249Return a copy of the string converted to lowercase.
12250[clinic start generated code]*/
12251
12252static PyObject *
12253unicode_lower_impl(PyObject *self)
12254/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012256 if (PyUnicode_READY(self) == -1)
12257 return NULL;
12258 if (PyUnicode_IS_ASCII(self))
12259 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012260 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261}
12262
/* Strip modes accepted by the strip helpers in this file: which end(s) of
   the string characters are removed from. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip mode i; i must be one
   of the three mode constants, since the array is indexed without a
   bounds check. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012271
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012272/* externally visible for str.strip(unicode) */
12273PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012274_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012275{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 void *data;
12277 int kind;
12278 Py_ssize_t i, j, len;
12279 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012280 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12283 return NULL;
12284
12285 kind = PyUnicode_KIND(self);
12286 data = PyUnicode_DATA(self);
12287 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012288 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012289 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12290 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012291 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012292
Benjamin Peterson14339b62009-01-31 16:36:08 +000012293 i = 0;
12294 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012295 while (i < len) {
12296 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12297 if (!BLOOM(sepmask, ch))
12298 break;
12299 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12300 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012301 i++;
12302 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012303 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012304
Benjamin Peterson14339b62009-01-31 16:36:08 +000012305 j = len;
12306 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012307 j--;
12308 while (j >= i) {
12309 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12310 if (!BLOOM(sepmask, ch))
12311 break;
12312 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12313 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012314 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012315 }
12316
Benjamin Peterson29060642009-01-31 22:14:21 +000012317 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012318 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012319
Victor Stinner7931d9a2011-11-04 00:22:48 +010012320 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321}
12322
12323PyObject*
12324PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12325{
12326 unsigned char *data;
12327 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012328 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329
Victor Stinnerde636f32011-10-01 03:55:54 +020012330 if (PyUnicode_READY(self) == -1)
12331 return NULL;
12332
Victor Stinner684d5fd2012-05-03 02:32:34 +020012333 length = PyUnicode_GET_LENGTH(self);
12334 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012335
Victor Stinner684d5fd2012-05-03 02:32:34 +020012336 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012337 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338
Victor Stinnerde636f32011-10-01 03:55:54 +020012339 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012340 PyErr_SetString(PyExc_IndexError, "string index out of range");
12341 return NULL;
12342 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012343 if (start >= length || end < start)
12344 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012345
Victor Stinner684d5fd2012-05-03 02:32:34 +020012346 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012347 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012348 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012349 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012350 }
12351 else {
12352 kind = PyUnicode_KIND(self);
12353 data = PyUnicode_1BYTE_DATA(self);
12354 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012355 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012356 length);
12357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359
12360static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012361do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 Py_ssize_t len, i, j;
12364
12365 if (PyUnicode_READY(self) == -1)
12366 return NULL;
12367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012369
Victor Stinnercc7af722013-04-09 22:39:24 +020012370 if (PyUnicode_IS_ASCII(self)) {
12371 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12372
12373 i = 0;
12374 if (striptype != RIGHTSTRIP) {
12375 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012376 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012377 if (!_Py_ascii_whitespace[ch])
12378 break;
12379 i++;
12380 }
12381 }
12382
12383 j = len;
12384 if (striptype != LEFTSTRIP) {
12385 j--;
12386 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012387 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012388 if (!_Py_ascii_whitespace[ch])
12389 break;
12390 j--;
12391 }
12392 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012393 }
12394 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012395 else {
12396 int kind = PyUnicode_KIND(self);
12397 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012398
Victor Stinnercc7af722013-04-09 22:39:24 +020012399 i = 0;
12400 if (striptype != RIGHTSTRIP) {
12401 while (i < len) {
12402 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12403 if (!Py_UNICODE_ISSPACE(ch))
12404 break;
12405 i++;
12406 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012407 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012408
12409 j = len;
12410 if (striptype != LEFTSTRIP) {
12411 j--;
12412 while (j >= i) {
12413 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12414 if (!Py_UNICODE_ISSPACE(ch))
12415 break;
12416 j--;
12417 }
12418 j++;
12419 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012420 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012421
Victor Stinner7931d9a2011-11-04 00:22:48 +010012422 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012423}
12424
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012425
12426static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012427do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012428{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012429 if (sep != NULL && sep != Py_None) {
12430 if (PyUnicode_Check(sep))
12431 return _PyUnicode_XStrip(self, striptype, sep);
12432 else {
12433 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012434 "%s arg must be None or str",
12435 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012436 return NULL;
12437 }
12438 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012439
Benjamin Peterson14339b62009-01-31 16:36:08 +000012440 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012441}
12442
12443
INADA Naoki3ae20562017-01-16 20:41:20 +090012444/*[clinic input]
12445str.strip as unicode_strip
12446
12447 chars: object = None
12448 /
12449
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012451
12452If chars is given and not None, remove characters in chars instead.
12453[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012454
12455static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012456unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012457/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012458{
INADA Naoki3ae20562017-01-16 20:41:20 +090012459 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012460}
12461
12462
INADA Naoki3ae20562017-01-16 20:41:20 +090012463/*[clinic input]
12464str.lstrip as unicode_lstrip
12465
12466 chars: object = NULL
12467 /
12468
12469Return a copy of the string with leading whitespace removed.
12470
12471If chars is given and not None, remove characters in chars instead.
12472[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012473
12474static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012475unicode_lstrip_impl(PyObject *self, PyObject *chars)
12476/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012477{
INADA Naoki3ae20562017-01-16 20:41:20 +090012478 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012479}
12480
12481
INADA Naoki3ae20562017-01-16 20:41:20 +090012482/*[clinic input]
12483str.rstrip as unicode_rstrip
12484
12485 chars: object = NULL
12486 /
12487
12488Return a copy of the string with trailing whitespace removed.
12489
12490If chars is given and not None, remove characters in chars instead.
12491[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012492
12493static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012494unicode_rstrip_impl(PyObject *self, PyObject *chars)
12495/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012496{
INADA Naoki3ae20562017-01-16 20:41:20 +090012497 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012498}
12499
12500
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012502unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012504 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506
Serhiy Storchaka05997252013-01-26 12:14:02 +020012507 if (len < 1)
12508 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509
Victor Stinnerc4b49542011-12-11 22:44:26 +010012510 /* no repeat, return original string */
12511 if (len == 1)
12512 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012513
Benjamin Petersonbac79492012-01-14 13:34:47 -050012514 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 return NULL;
12516
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012517 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012518 PyErr_SetString(PyExc_OverflowError,
12519 "repeated string is too long");
12520 return NULL;
12521 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012523
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012524 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525 if (!u)
12526 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012527 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 if (PyUnicode_GET_LENGTH(str) == 1) {
12530 const int kind = PyUnicode_KIND(str);
12531 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012532 if (kind == PyUnicode_1BYTE_KIND) {
12533 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012534 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012535 }
12536 else if (kind == PyUnicode_2BYTE_KIND) {
12537 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012538 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012539 ucs2[n] = fill_char;
12540 } else {
12541 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12542 assert(kind == PyUnicode_4BYTE_KIND);
12543 for (n = 0; n < len; ++n)
12544 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 }
12547 else {
12548 /* number of characters copied this far */
12549 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012550 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012552 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012554 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012556 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012557 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559 }
12560
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012561 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012562 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563}
12564
Alexander Belopolsky40018472011-02-26 01:02:56 +000012565PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012566PyUnicode_Replace(PyObject *str,
12567 PyObject *substr,
12568 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012569 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012571 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12572 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012573 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012574 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575}
12576
INADA Naoki3ae20562017-01-16 20:41:20 +090012577/*[clinic input]
12578str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579
INADA Naoki3ae20562017-01-16 20:41:20 +090012580 old: unicode
12581 new: unicode
12582 count: Py_ssize_t = -1
12583 Maximum number of occurrences to replace.
12584 -1 (the default value) means replace all occurrences.
12585 /
12586
12587Return a copy with all occurrences of substring old replaced by new.
12588
12589If the optional argument count is given, only the first count occurrences are
12590replaced.
12591[clinic start generated code]*/
12592
12593static PyObject *
12594unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12595 Py_ssize_t count)
12596/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012598 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012599 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012600 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601}
12602
Alexander Belopolsky40018472011-02-26 01:02:56 +000012603static PyObject *
12604unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012606 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012607 Py_ssize_t isize;
12608 Py_ssize_t osize, squote, dquote, i, o;
12609 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012610 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012614 return NULL;
12615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 isize = PyUnicode_GET_LENGTH(unicode);
12617 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619 /* Compute length of output, quote characters, and
12620 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012621 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 max = 127;
12623 squote = dquote = 0;
12624 ikind = PyUnicode_KIND(unicode);
12625 for (i = 0; i < isize; i++) {
12626 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012627 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012629 case '\'': squote++; break;
12630 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012632 incr = 2;
12633 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 default:
12635 /* Fast-path ASCII */
12636 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012637 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012639 ;
12640 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012643 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012645 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012647 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012649 if (osize > PY_SSIZE_T_MAX - incr) {
12650 PyErr_SetString(PyExc_OverflowError,
12651 "string is too long to generate repr");
12652 return NULL;
12653 }
12654 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 }
12656
12657 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012658 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012660 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 if (dquote)
12662 /* Both squote and dquote present. Use squote,
12663 and escape them */
12664 osize += squote;
12665 else
12666 quote = '"';
12667 }
Victor Stinner55c08782013-04-14 18:45:39 +020012668 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669
12670 repr = PyUnicode_New(osize, max);
12671 if (repr == NULL)
12672 return NULL;
12673 okind = PyUnicode_KIND(repr);
12674 odata = PyUnicode_DATA(repr);
12675
12676 PyUnicode_WRITE(okind, odata, 0, quote);
12677 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012678 if (unchanged) {
12679 _PyUnicode_FastCopyCharacters(repr, 1,
12680 unicode, 0,
12681 isize);
12682 }
12683 else {
12684 for (i = 0, o = 1; i < isize; i++) {
12685 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686
Victor Stinner55c08782013-04-14 18:45:39 +020012687 /* Escape quotes and backslashes */
12688 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012689 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012691 continue;
12692 }
12693
12694 /* Map special whitespace to '\t', \n', '\r' */
12695 if (ch == '\t') {
12696 PyUnicode_WRITE(okind, odata, o++, '\\');
12697 PyUnicode_WRITE(okind, odata, o++, 't');
12698 }
12699 else if (ch == '\n') {
12700 PyUnicode_WRITE(okind, odata, o++, '\\');
12701 PyUnicode_WRITE(okind, odata, o++, 'n');
12702 }
12703 else if (ch == '\r') {
12704 PyUnicode_WRITE(okind, odata, o++, '\\');
12705 PyUnicode_WRITE(okind, odata, o++, 'r');
12706 }
12707
12708 /* Map non-printable US ASCII to '\xhh' */
12709 else if (ch < ' ' || ch == 0x7F) {
12710 PyUnicode_WRITE(okind, odata, o++, '\\');
12711 PyUnicode_WRITE(okind, odata, o++, 'x');
12712 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12713 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12714 }
12715
12716 /* Copy ASCII characters as-is */
12717 else if (ch < 0x7F) {
12718 PyUnicode_WRITE(okind, odata, o++, ch);
12719 }
12720
12721 /* Non-ASCII characters */
12722 else {
12723 /* Map Unicode whitespace and control characters
12724 (categories Z* and C* except ASCII space)
12725 */
12726 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12727 PyUnicode_WRITE(okind, odata, o++, '\\');
12728 /* Map 8-bit characters to '\xhh' */
12729 if (ch <= 0xff) {
12730 PyUnicode_WRITE(okind, odata, o++, 'x');
12731 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12732 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12733 }
12734 /* Map 16-bit characters to '\uxxxx' */
12735 else if (ch <= 0xffff) {
12736 PyUnicode_WRITE(okind, odata, o++, 'u');
12737 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12738 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12739 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12740 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12741 }
12742 /* Map 21-bit characters to '\U00xxxxxx' */
12743 else {
12744 PyUnicode_WRITE(okind, odata, o++, 'U');
12745 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12746 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12747 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12748 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12749 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12750 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12751 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12752 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12753 }
12754 }
12755 /* Copy characters as-is */
12756 else {
12757 PyUnicode_WRITE(okind, odata, o++, ch);
12758 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012759 }
12760 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012761 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012762 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012763 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012764 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765}
12766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012767PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012768 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769\n\
12770Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012771such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772arguments start and end are interpreted as in slice notation.\n\
12773\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012774Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775
12776static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012779 /* initialize variables to prevent gcc warning */
12780 PyObject *substring = NULL;
12781 Py_ssize_t start = 0;
12782 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012783 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012785 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012786 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012788 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012791 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 if (result == -2)
12794 return NULL;
12795
Christian Heimes217cfd12007-12-02 14:31:20 +000012796 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797}
12798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012799PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012800 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012802Return the highest index in S where substring sub is found,\n\
12803such that sub is contained within S[start:end]. Optional\n\
12804arguments start and end are interpreted as in slice notation.\n\
12805\n\
12806Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807
12808static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012811 /* initialize variables to prevent gcc warning */
12812 PyObject *substring = NULL;
12813 Py_ssize_t start = 0;
12814 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012815 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012817 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012818 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012820 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012823 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 if (result == -2)
12826 return NULL;
12827
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828 if (result < 0) {
12829 PyErr_SetString(PyExc_ValueError, "substring not found");
12830 return NULL;
12831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832
Christian Heimes217cfd12007-12-02 14:31:20 +000012833 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834}
12835
INADA Naoki3ae20562017-01-16 20:41:20 +090012836/*[clinic input]
12837str.rjust as unicode_rjust
12838
12839 width: Py_ssize_t
12840 fillchar: Py_UCS4 = ' '
12841 /
12842
12843Return a right-justified string of length width.
12844
12845Padding is done using the specified fill character (default is a space).
12846[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847
12848static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012849unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12850/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012852 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853 return NULL;
12854
Victor Stinnerc4b49542011-12-11 22:44:26 +010012855 if (PyUnicode_GET_LENGTH(self) >= width)
12856 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857
Victor Stinnerc4b49542011-12-11 22:44:26 +010012858 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859}
12860
Alexander Belopolsky40018472011-02-26 01:02:56 +000012861PyObject *
12862PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012864 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012865 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012867 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868}
12869
INADA Naoki3ae20562017-01-16 20:41:20 +090012870/*[clinic input]
12871str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872
INADA Naoki3ae20562017-01-16 20:41:20 +090012873 sep: object = None
        The delimiter according to which to split the string.
12875 None (the default value) means split according to any whitespace,
12876 and discard empty strings from the result.
12877 maxsplit: Py_ssize_t = -1
12878 Maximum number of splits to do.
12879 -1 (the default value) means no limit.
12880
12881Return a list of the words in the string, using sep as the delimiter string.
12882[clinic start generated code]*/
12883
12884static PyObject *
12885unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12886/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887{
INADA Naoki3ae20562017-01-16 20:41:20 +090012888 if (sep == Py_None)
12889 return split(self, NULL, maxsplit);
12890 if (PyUnicode_Check(sep))
12891 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012892
12893 PyErr_Format(PyExc_TypeError,
12894 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012895 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012896 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897}
12898
Thomas Wouters477c8d52006-05-27 19:21:47 +000012899PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012900PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012901{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012902 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012903 int kind1, kind2;
12904 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012905 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012906
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012907 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012908 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012909
Victor Stinner14f8f022011-10-05 20:58:25 +020012910 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012912 len1 = PyUnicode_GET_LENGTH(str_obj);
12913 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012914 if (kind1 < kind2 || len1 < len2) {
12915 _Py_INCREF_UNICODE_EMPTY();
12916 if (!unicode_empty)
12917 out = NULL;
12918 else {
12919 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12920 Py_DECREF(unicode_empty);
12921 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012922 return out;
12923 }
12924 buf1 = PyUnicode_DATA(str_obj);
12925 buf2 = PyUnicode_DATA(sep_obj);
12926 if (kind2 != kind1) {
12927 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12928 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012929 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012930 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012932 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012934 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12935 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12936 else
12937 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012938 break;
12939 case PyUnicode_2BYTE_KIND:
12940 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12941 break;
12942 case PyUnicode_4BYTE_KIND:
12943 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12944 break;
12945 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012946 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012947 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012948
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012949 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012951
12952 return out;
12953}
12954
12955
12956PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012957PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012958{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012959 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012960 int kind1, kind2;
12961 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012963
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012964 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012965 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012966
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012967 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 len1 = PyUnicode_GET_LENGTH(str_obj);
12970 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012971 if (kind1 < kind2 || len1 < len2) {
12972 _Py_INCREF_UNICODE_EMPTY();
12973 if (!unicode_empty)
12974 out = NULL;
12975 else {
12976 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12977 Py_DECREF(unicode_empty);
12978 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012979 return out;
12980 }
12981 buf1 = PyUnicode_DATA(str_obj);
12982 buf2 = PyUnicode_DATA(sep_obj);
12983 if (kind2 != kind1) {
12984 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12985 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012986 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012987 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012989 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012991 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12992 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12993 else
12994 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012995 break;
12996 case PyUnicode_2BYTE_KIND:
12997 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12998 break;
12999 case PyUnicode_4BYTE_KIND:
13000 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13001 break;
13002 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013003 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013005
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013006 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013008
13009 return out;
13010}
13011
INADA Naoki3ae20562017-01-16 20:41:20 +090013012/*[clinic input]
13013str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013014
INADA Naoki3ae20562017-01-16 20:41:20 +090013015 sep: object
13016 /
13017
13018Partition the string into three parts using the given separator.
13019
13020This will search for the separator in the string. If the separator is found,
13021returns a 3-tuple containing the part before the separator, the separator
13022itself, and the part after it.
13023
13024If the separator is not found, returns a 3-tuple containing the original string
13025and two empty strings.
13026[clinic start generated code]*/
13027
13028static PyObject *
13029unicode_partition(PyObject *self, PyObject *sep)
13030/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013031{
INADA Naoki3ae20562017-01-16 20:41:20 +090013032 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013033}
13034
INADA Naoki3ae20562017-01-16 20:41:20 +090013035/*[clinic input]
13036str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013037
INADA Naoki3ae20562017-01-16 20:41:20 +090013038Partition the string into three parts using the given separator.
13039
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013040This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013041the separator is found, returns a 3-tuple containing the part before the
13042separator, the separator itself, and the part after it.
13043
13044If the separator is not found, returns a 3-tuple containing two empty strings
13045and the original string.
13046[clinic start generated code]*/
13047
13048static PyObject *
13049unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013050/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013051{
INADA Naoki3ae20562017-01-16 20:41:20 +090013052 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013053}
13054
Alexander Belopolsky40018472011-02-26 01:02:56 +000013055PyObject *
13056PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013057{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013058 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013059 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013060
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013061 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013062}
13063
INADA Naoki3ae20562017-01-16 20:41:20 +090013064/*[clinic input]
13065str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013066
INADA Naoki3ae20562017-01-16 20:41:20 +090013067Return a list of the words in the string, using sep as the delimiter string.
13068
13069Splits are done starting at the end of the string and working to the front.
13070[clinic start generated code]*/
13071
13072static PyObject *
13073unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13074/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013075{
INADA Naoki3ae20562017-01-16 20:41:20 +090013076 if (sep == Py_None)
13077 return rsplit(self, NULL, maxsplit);
13078 if (PyUnicode_Check(sep))
13079 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013080
13081 PyErr_Format(PyExc_TypeError,
13082 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013083 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013084 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013085}
13086
INADA Naoki3ae20562017-01-16 20:41:20 +090013087/*[clinic input]
13088str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013090 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013091
13092Return a list of the lines in the string, breaking at line boundaries.
13093
13094Line breaks are not included in the resulting list unless keepends is given and
13095true.
13096[clinic start generated code]*/
13097
13098static PyObject *
13099unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013100/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013102 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103}
13104
13105static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013106PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013108 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109}
13110
INADA Naoki3ae20562017-01-16 20:41:20 +090013111/*[clinic input]
13112str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113
INADA Naoki3ae20562017-01-16 20:41:20 +090013114Convert uppercase characters to lowercase and lowercase characters to uppercase.
13115[clinic start generated code]*/
13116
13117static PyObject *
13118unicode_swapcase_impl(PyObject *self)
13119/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013121 if (PyUnicode_READY(self) == -1)
13122 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013123 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124}
13125
Larry Hastings61272b72014-01-07 12:41:53 -080013126/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013127
Larry Hastings31826802013-10-19 00:09:25 -070013128@staticmethod
13129str.maketrans as unicode_maketrans
13130
13131 x: object
13132
13133 y: unicode=NULL
13134
13135 z: unicode=NULL
13136
13137 /
13138
13139Return a translation table usable for str.translate().
13140
13141If there is only one argument, it must be a dictionary mapping Unicode
13142ordinals (integers) or characters to Unicode ordinals, strings or None.
13143Character keys will be then converted to ordinals.
13144If there are two arguments, they must be strings of equal length, and
13145in the resulting dictionary, each character in x will be mapped to the
13146character at the same position in y. If there is a third argument, it
13147must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013148[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013149
Larry Hastings31826802013-10-19 00:09:25 -070013150static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013151unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013152/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013153{
Georg Brandlceee0772007-11-27 23:48:05 +000013154 PyObject *new = NULL, *key, *value;
13155 Py_ssize_t i = 0;
13156 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013157
Georg Brandlceee0772007-11-27 23:48:05 +000013158 new = PyDict_New();
13159 if (!new)
13160 return NULL;
13161 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013162 int x_kind, y_kind, z_kind;
13163 void *x_data, *y_data, *z_data;
13164
Georg Brandlceee0772007-11-27 23:48:05 +000013165 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013166 if (!PyUnicode_Check(x)) {
13167 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13168 "be a string if there is a second argument");
13169 goto err;
13170 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013171 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013172 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13173 "arguments must have equal length");
13174 goto err;
13175 }
13176 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013177 x_kind = PyUnicode_KIND(x);
13178 y_kind = PyUnicode_KIND(y);
13179 x_data = PyUnicode_DATA(x);
13180 y_data = PyUnicode_DATA(y);
13181 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13182 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013183 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013184 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013185 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013186 if (!value) {
13187 Py_DECREF(key);
13188 goto err;
13189 }
Georg Brandlceee0772007-11-27 23:48:05 +000013190 res = PyDict_SetItem(new, key, value);
13191 Py_DECREF(key);
13192 Py_DECREF(value);
13193 if (res < 0)
13194 goto err;
13195 }
13196 /* create entries for deleting chars in z */
13197 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198 z_kind = PyUnicode_KIND(z);
13199 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013200 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013201 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013202 if (!key)
13203 goto err;
13204 res = PyDict_SetItem(new, key, Py_None);
13205 Py_DECREF(key);
13206 if (res < 0)
13207 goto err;
13208 }
13209 }
13210 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013211 int kind;
13212 void *data;
13213
Georg Brandlceee0772007-11-27 23:48:05 +000013214 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013215 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013216 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13217 "to maketrans it must be a dict");
13218 goto err;
13219 }
13220 /* copy entries into the new dict, converting string keys to int keys */
13221 while (PyDict_Next(x, &i, &key, &value)) {
13222 if (PyUnicode_Check(key)) {
13223 /* convert string keys to integer keys */
13224 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013225 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013226 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13227 "table must be of length 1");
13228 goto err;
13229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013230 kind = PyUnicode_KIND(key);
13231 data = PyUnicode_DATA(key);
13232 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013233 if (!newkey)
13234 goto err;
13235 res = PyDict_SetItem(new, newkey, value);
13236 Py_DECREF(newkey);
13237 if (res < 0)
13238 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013239 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013240 /* just keep integer keys */
13241 if (PyDict_SetItem(new, key, value) < 0)
13242 goto err;
13243 } else {
13244 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13245 "be strings or integers");
13246 goto err;
13247 }
13248 }
13249 }
13250 return new;
13251 err:
13252 Py_DECREF(new);
13253 return NULL;
13254}
13255
INADA Naoki3ae20562017-01-16 20:41:20 +090013256/*[clinic input]
13257str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013258
INADA Naoki3ae20562017-01-16 20:41:20 +090013259 table: object
13260 Translation table, which must be a mapping of Unicode ordinals to
13261 Unicode ordinals, strings, or None.
13262 /
13263
13264Replace each character in the string using the given translation table.
13265
13266The table must implement lookup/indexing via __getitem__, for instance a
13267dictionary or list. If this operation raises LookupError, the character is
13268left untouched. Characters mapped to None are deleted.
13269[clinic start generated code]*/
13270
13271static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013272unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013273/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013275 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013276}
13277
INADA Naoki3ae20562017-01-16 20:41:20 +090013278/*[clinic input]
13279str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280
INADA Naoki3ae20562017-01-16 20:41:20 +090013281Return a copy of the string converted to uppercase.
13282[clinic start generated code]*/
13283
13284static PyObject *
13285unicode_upper_impl(PyObject *self)
13286/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013288 if (PyUnicode_READY(self) == -1)
13289 return NULL;
13290 if (PyUnicode_IS_ASCII(self))
13291 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013292 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013293}
13294
INADA Naoki3ae20562017-01-16 20:41:20 +090013295/*[clinic input]
13296str.zfill as unicode_zfill
13297
13298 width: Py_ssize_t
13299 /
13300
13301Pad a numeric string with zeros on the left, to fill a field of the given width.
13302
13303The string is never truncated.
13304[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013305
13306static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013307unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013308/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013310 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013311 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013312 int kind;
13313 void *data;
13314 Py_UCS4 chr;
13315
Benjamin Petersonbac79492012-01-14 13:34:47 -050013316 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013317 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318
Victor Stinnerc4b49542011-12-11 22:44:26 +010013319 if (PyUnicode_GET_LENGTH(self) >= width)
13320 return unicode_result_unchanged(self);
13321
13322 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013323
13324 u = pad(self, fill, 0, '0');
13325
Walter Dörwald068325e2002-04-15 13:36:47 +000013326 if (u == NULL)
13327 return NULL;
13328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013329 kind = PyUnicode_KIND(u);
13330 data = PyUnicode_DATA(u);
13331 chr = PyUnicode_READ(kind, data, fill);
13332
13333 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013334 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013335 PyUnicode_WRITE(kind, data, 0, chr);
13336 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013337 }
13338
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013339 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013340 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013341}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013342
#if 0
/* Disabled by the surrounding #if 0: wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013351PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013352 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013354Return True if S starts with the specified prefix, False otherwise.\n\
13355With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013356With optional end, stop comparing S at that position.\n\
13357prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013358
13359static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013360unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013361 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013363 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013364 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013365 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013366 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013367 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013368
Jesus Ceaac451502011-04-20 17:09:23 +020013369 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013370 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013371 if (PyTuple_Check(subobj)) {
13372 Py_ssize_t i;
13373 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013374 substring = PyTuple_GET_ITEM(subobj, i);
13375 if (!PyUnicode_Check(substring)) {
13376 PyErr_Format(PyExc_TypeError,
13377 "tuple for startswith must only contain str, "
13378 "not %.100s",
13379 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013380 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013381 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013382 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013383 if (result == -1)
13384 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013385 if (result) {
13386 Py_RETURN_TRUE;
13387 }
13388 }
13389 /* nothing matched */
13390 Py_RETURN_FALSE;
13391 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013392 if (!PyUnicode_Check(subobj)) {
13393 PyErr_Format(PyExc_TypeError,
13394 "startswith first arg must be str or "
13395 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013396 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013397 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013398 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013399 if (result == -1)
13400 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013401 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013402}
13403
13404
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013405PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013406 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013407\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013408Return True if S ends with the specified suffix, False otherwise.\n\
13409With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013410With optional end, stop comparing S at that position.\n\
13411suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013412
13413static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013414unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013415 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013416{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013417 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013418 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013419 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013420 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013421 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422
Jesus Ceaac451502011-04-20 17:09:23 +020013423 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013424 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013425 if (PyTuple_Check(subobj)) {
13426 Py_ssize_t i;
13427 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013428 substring = PyTuple_GET_ITEM(subobj, i);
13429 if (!PyUnicode_Check(substring)) {
13430 PyErr_Format(PyExc_TypeError,
13431 "tuple for endswith must only contain str, "
13432 "not %.100s",
13433 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013435 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013436 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013437 if (result == -1)
13438 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013439 if (result) {
13440 Py_RETURN_TRUE;
13441 }
13442 }
13443 Py_RETURN_FALSE;
13444 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013445 if (!PyUnicode_Check(subobj)) {
13446 PyErr_Format(PyExc_TypeError,
13447 "endswith first arg must be str or "
13448 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013449 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013450 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013451 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013452 if (result == -1)
13453 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013454 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013455}
13456
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013457static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013458_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013459{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013460 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13461 writer->data = PyUnicode_DATA(writer->buffer);
13462
13463 if (!writer->readonly) {
13464 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013465 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013466 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013467 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013468 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13469 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13470 writer->kind = PyUnicode_WCHAR_KIND;
13471 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13472
Victor Stinner8f674cc2013-04-17 23:02:17 +020013473 /* Copy-on-write mode: set buffer size to 0 so
13474 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13475 * next write. */
13476 writer->size = 0;
13477 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013478}
13479
Victor Stinnerd3f08822012-05-29 12:57:52 +020013480void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013481_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013482{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013483 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013484
13485 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013486 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013487
13488 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13489 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13490 writer->kind = PyUnicode_WCHAR_KIND;
13491 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013492}
13493
Victor Stinnerd3f08822012-05-29 12:57:52 +020013494int
13495_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13496 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013497{
13498 Py_ssize_t newlen;
13499 PyObject *newbuffer;
13500
Victor Stinner2740e462016-09-06 16:58:36 -070013501 assert(maxchar <= MAX_UNICODE);
13502
Victor Stinnerca9381e2015-09-22 00:58:32 +020013503 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013504 assert((maxchar > writer->maxchar && length >= 0)
13505 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013506
Victor Stinner202fdca2012-05-07 12:47:02 +020013507 if (length > PY_SSIZE_T_MAX - writer->pos) {
13508 PyErr_NoMemory();
13509 return -1;
13510 }
13511 newlen = writer->pos + length;
13512
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013513 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013514
Victor Stinnerd3f08822012-05-29 12:57:52 +020013515 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013516 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013517 if (writer->overallocate
13518 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13519 /* overallocate to limit the number of realloc() */
13520 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013521 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013522 if (newlen < writer->min_length)
13523 newlen = writer->min_length;
13524
Victor Stinnerd3f08822012-05-29 12:57:52 +020013525 writer->buffer = PyUnicode_New(newlen, maxchar);
13526 if (writer->buffer == NULL)
13527 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013528 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013529 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013530 if (writer->overallocate
13531 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13532 /* overallocate to limit the number of realloc() */
13533 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013534 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013535 if (newlen < writer->min_length)
13536 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013537
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013538 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013539 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013540 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013541 newbuffer = PyUnicode_New(newlen, maxchar);
13542 if (newbuffer == NULL)
13543 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013544 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13545 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013546 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013547 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013548 }
13549 else {
13550 newbuffer = resize_compact(writer->buffer, newlen);
13551 if (newbuffer == NULL)
13552 return -1;
13553 }
13554 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013555 }
13556 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013557 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013558 newbuffer = PyUnicode_New(writer->size, maxchar);
13559 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013560 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013561 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13562 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013563 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013564 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013565 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013566 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013567
13568#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013569}
13570
Victor Stinnerca9381e2015-09-22 00:58:32 +020013571int
13572_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13573 enum PyUnicode_Kind kind)
13574{
13575 Py_UCS4 maxchar;
13576
13577 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13578 assert(writer->kind < kind);
13579
13580 switch (kind)
13581 {
13582 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13583 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13584 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13585 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013586 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013587 }
13588
13589 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13590}
13591
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013592static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013593_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013594{
Victor Stinner2740e462016-09-06 16:58:36 -070013595 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013596 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13597 return -1;
13598 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13599 writer->pos++;
13600 return 0;
13601}
13602
13603int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013604_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13605{
13606 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13607}
13608
13609int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013610_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13611{
13612 Py_UCS4 maxchar;
13613 Py_ssize_t len;
13614
13615 if (PyUnicode_READY(str) == -1)
13616 return -1;
13617 len = PyUnicode_GET_LENGTH(str);
13618 if (len == 0)
13619 return 0;
13620 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13621 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013622 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013623 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013624 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013625 Py_INCREF(str);
13626 writer->buffer = str;
13627 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013628 writer->pos += len;
13629 return 0;
13630 }
13631 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13632 return -1;
13633 }
13634 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13635 str, 0, len);
13636 writer->pos += len;
13637 return 0;
13638}
13639
Victor Stinnere215d962012-10-06 23:03:36 +020013640int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013641_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13642 Py_ssize_t start, Py_ssize_t end)
13643{
13644 Py_UCS4 maxchar;
13645 Py_ssize_t len;
13646
13647 if (PyUnicode_READY(str) == -1)
13648 return -1;
13649
13650 assert(0 <= start);
13651 assert(end <= PyUnicode_GET_LENGTH(str));
13652 assert(start <= end);
13653
13654 if (end == 0)
13655 return 0;
13656
13657 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13658 return _PyUnicodeWriter_WriteStr(writer, str);
13659
13660 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13661 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13662 else
13663 maxchar = writer->maxchar;
13664 len = end - start;
13665
13666 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13667 return -1;
13668
13669 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13670 str, start, len);
13671 writer->pos += len;
13672 return 0;
13673}
13674
13675int
Victor Stinner4a587072013-11-19 12:54:53 +010013676_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13677 const char *ascii, Py_ssize_t len)
13678{
13679 if (len == -1)
13680 len = strlen(ascii);
13681
13682 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13683
13684 if (writer->buffer == NULL && !writer->overallocate) {
13685 PyObject *str;
13686
13687 str = _PyUnicode_FromASCII(ascii, len);
13688 if (str == NULL)
13689 return -1;
13690
13691 writer->readonly = 1;
13692 writer->buffer = str;
13693 _PyUnicodeWriter_Update(writer);
13694 writer->pos += len;
13695 return 0;
13696 }
13697
13698 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13699 return -1;
13700
13701 switch (writer->kind)
13702 {
13703 case PyUnicode_1BYTE_KIND:
13704 {
13705 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13706 Py_UCS1 *data = writer->data;
13707
Christian Heimesf051e432016-09-13 20:22:02 +020013708 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013709 break;
13710 }
13711 case PyUnicode_2BYTE_KIND:
13712 {
13713 _PyUnicode_CONVERT_BYTES(
13714 Py_UCS1, Py_UCS2,
13715 ascii, ascii + len,
13716 (Py_UCS2 *)writer->data + writer->pos);
13717 break;
13718 }
13719 case PyUnicode_4BYTE_KIND:
13720 {
13721 _PyUnicode_CONVERT_BYTES(
13722 Py_UCS1, Py_UCS4,
13723 ascii, ascii + len,
13724 (Py_UCS4 *)writer->data + writer->pos);
13725 break;
13726 }
13727 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013728 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013729 }
13730
13731 writer->pos += len;
13732 return 0;
13733}
13734
13735int
13736_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13737 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013738{
13739 Py_UCS4 maxchar;
13740
13741 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13742 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13743 return -1;
13744 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13745 writer->pos += len;
13746 return 0;
13747}
13748
Victor Stinnerd3f08822012-05-29 12:57:52 +020013749PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013750_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013751{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013752 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013753
Victor Stinnerd3f08822012-05-29 12:57:52 +020013754 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013755 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013756 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013757 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013758
13759 str = writer->buffer;
13760 writer->buffer = NULL;
13761
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013762 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013763 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13764 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013765 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013766
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013767 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13768 PyObject *str2;
13769 str2 = resize_compact(str, writer->pos);
13770 if (str2 == NULL) {
13771 Py_DECREF(str);
13772 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013773 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013774 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013775 }
13776
Victor Stinner15a0bd32013-07-08 22:29:55 +020013777 assert(_PyUnicode_CheckConsistency(str, 1));
13778 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013779}
13780
Victor Stinnerd3f08822012-05-29 12:57:52 +020013781void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013782_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013783{
13784 Py_CLEAR(writer->buffer);
13785}
13786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013787#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013788
13789PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013790 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013791\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013792Return a formatted version of S, using substitutions from args and kwargs.\n\
13793The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013794
Eric Smith27bbca62010-11-04 17:06:58 +000013795PyDoc_STRVAR(format_map__doc__,
13796 "S.format_map(mapping) -> str\n\
13797\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013798Return a formatted version of S, using substitutions from mapping.\n\
13799The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013800
INADA Naoki3ae20562017-01-16 20:41:20 +090013801/*[clinic input]
13802str.__format__ as unicode___format__
13803
13804 format_spec: unicode
13805 /
13806
13807Return a formatted version of the string as described by format_spec.
13808[clinic start generated code]*/
13809
Eric Smith4a7d76d2008-05-30 18:10:19 +000013810static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013811unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013812/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013813{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013814 _PyUnicodeWriter writer;
13815 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013816
Victor Stinnerd3f08822012-05-29 12:57:52 +020013817 if (PyUnicode_READY(self) == -1)
13818 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013819 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013820 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13821 self, format_spec, 0,
13822 PyUnicode_GET_LENGTH(format_spec));
13823 if (ret == -1) {
13824 _PyUnicodeWriter_Dealloc(&writer);
13825 return NULL;
13826 }
13827 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013828}
13829
INADA Naoki3ae20562017-01-16 20:41:20 +090013830/*[clinic input]
13831str.__sizeof__ as unicode_sizeof
13832
13833Return the size of the string in memory, in bytes.
13834[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013835
13836static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013837unicode_sizeof_impl(PyObject *self)
13838/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013840 Py_ssize_t size;
13841
13842 /* If it's a compact object, account for base structure +
13843 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013844 if (PyUnicode_IS_COMPACT_ASCII(self))
13845 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13846 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013847 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013848 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013849 else {
13850 /* If it is a two-block object, account for base object, and
13851 for character block if present. */
13852 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013853 if (_PyUnicode_DATA_ANY(self))
13854 size += (PyUnicode_GET_LENGTH(self) + 1) *
13855 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013856 }
13857 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013858 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013859 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13860 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13861 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13862 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013863
13864 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013865}
13866
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013867static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013868unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013869{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013870 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013871 if (!copy)
13872 return NULL;
13873 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013874}
13875
Guido van Rossumd57fd912000-03-10 22:53:23 +000013876static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013877 UNICODE_ENCODE_METHODDEF
13878 UNICODE_REPLACE_METHODDEF
13879 UNICODE_SPLIT_METHODDEF
13880 UNICODE_RSPLIT_METHODDEF
13881 UNICODE_JOIN_METHODDEF
13882 UNICODE_CAPITALIZE_METHODDEF
13883 UNICODE_CASEFOLD_METHODDEF
13884 UNICODE_TITLE_METHODDEF
13885 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013886 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013887 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013888 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013889 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013890 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013891 UNICODE_LJUST_METHODDEF
13892 UNICODE_LOWER_METHODDEF
13893 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013894 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13895 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013896 UNICODE_RJUST_METHODDEF
13897 UNICODE_RSTRIP_METHODDEF
13898 UNICODE_RPARTITION_METHODDEF
13899 UNICODE_SPLITLINES_METHODDEF
13900 UNICODE_STRIP_METHODDEF
13901 UNICODE_SWAPCASE_METHODDEF
13902 UNICODE_TRANSLATE_METHODDEF
13903 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013904 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13905 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013906 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013907 UNICODE_ISLOWER_METHODDEF
13908 UNICODE_ISUPPER_METHODDEF
13909 UNICODE_ISTITLE_METHODDEF
13910 UNICODE_ISSPACE_METHODDEF
13911 UNICODE_ISDECIMAL_METHODDEF
13912 UNICODE_ISDIGIT_METHODDEF
13913 UNICODE_ISNUMERIC_METHODDEF
13914 UNICODE_ISALPHA_METHODDEF
13915 UNICODE_ISALNUM_METHODDEF
13916 UNICODE_ISIDENTIFIER_METHODDEF
13917 UNICODE_ISPRINTABLE_METHODDEF
13918 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013919 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013920 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013921 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013922 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013923 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013924#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013925 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013926 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013927#endif
13928
Benjamin Peterson14339b62009-01-31 16:36:08 +000013929 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013930 {NULL, NULL}
13931};
13932
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013933static PyObject *
13934unicode_mod(PyObject *v, PyObject *w)
13935{
Brian Curtindfc80e32011-08-10 20:28:54 -050013936 if (!PyUnicode_Check(v))
13937 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013938 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013939}
13940
13941static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013942 0, /*nb_add*/
13943 0, /*nb_subtract*/
13944 0, /*nb_multiply*/
13945 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013946};
13947
Guido van Rossumd57fd912000-03-10 22:53:23 +000013948static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013949 (lenfunc) unicode_length, /* sq_length */
13950 PyUnicode_Concat, /* sq_concat */
13951 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13952 (ssizeargfunc) unicode_getitem, /* sq_item */
13953 0, /* sq_slice */
13954 0, /* sq_ass_item */
13955 0, /* sq_ass_slice */
13956 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013957};
13958
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013959static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013960unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013962 if (PyUnicode_READY(self) == -1)
13963 return NULL;
13964
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013965 if (PyIndex_Check(item)) {
13966 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013967 if (i == -1 && PyErr_Occurred())
13968 return NULL;
13969 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013970 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013971 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013972 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013973 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013974 PyObject *result;
13975 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013976 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013977 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013978
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013979 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013980 return NULL;
13981 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013982 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13983 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013984
13985 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013986 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013987 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013988 slicelength == PyUnicode_GET_LENGTH(self)) {
13989 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013990 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013991 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013992 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013993 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013994 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013995 src_kind = PyUnicode_KIND(self);
13996 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013997 if (!PyUnicode_IS_ASCII(self)) {
13998 kind_limit = kind_maxchar_limit(src_kind);
13999 max_char = 0;
14000 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14001 ch = PyUnicode_READ(src_kind, src_data, cur);
14002 if (ch > max_char) {
14003 max_char = ch;
14004 if (max_char >= kind_limit)
14005 break;
14006 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014007 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014008 }
Victor Stinner55c99112011-10-13 01:17:06 +020014009 else
14010 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014011 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014012 if (result == NULL)
14013 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014014 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014015 dest_data = PyUnicode_DATA(result);
14016
14017 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014018 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14019 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014020 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014021 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014022 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014023 } else {
14024 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14025 return NULL;
14026 }
14027}
14028
14029static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014030 (lenfunc)unicode_length, /* mp_length */
14031 (binaryfunc)unicode_subscript, /* mp_subscript */
14032 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014033};
14034
Guido van Rossumd57fd912000-03-10 22:53:23 +000014035
Guido van Rossumd57fd912000-03-10 22:53:23 +000014036/* Helpers for PyUnicode_Format() */
14037
Victor Stinnera47082312012-10-04 02:19:54 +020014038struct unicode_formatter_t {
14039 PyObject *args;
14040 int args_owned;
14041 Py_ssize_t arglen, argidx;
14042 PyObject *dict;
14043
14044 enum PyUnicode_Kind fmtkind;
14045 Py_ssize_t fmtcnt, fmtpos;
14046 void *fmtdata;
14047 PyObject *fmtstr;
14048
14049 _PyUnicodeWriter writer;
14050};
14051
14052struct unicode_format_arg_t {
14053 Py_UCS4 ch;
14054 int flags;
14055 Py_ssize_t width;
14056 int prec;
14057 int sign;
14058};
14059
Guido van Rossumd57fd912000-03-10 22:53:23 +000014060static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014061unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014062{
Victor Stinnera47082312012-10-04 02:19:54 +020014063 Py_ssize_t argidx = ctx->argidx;
14064
14065 if (argidx < ctx->arglen) {
14066 ctx->argidx++;
14067 if (ctx->arglen < 0)
14068 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014069 else
Victor Stinnera47082312012-10-04 02:19:54 +020014070 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014071 }
14072 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014073 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014074 return NULL;
14075}
14076
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014077/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014078
Victor Stinnera47082312012-10-04 02:19:54 +020014079/* Format a float into the writer if the writer is not NULL, or into *p_output
14080 otherwise.
14081
14082 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014083static int
Victor Stinnera47082312012-10-04 02:19:54 +020014084formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14085 PyObject **p_output,
14086 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014087{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014088 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014089 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014090 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014091 int prec;
14092 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014093
Guido van Rossumd57fd912000-03-10 22:53:23 +000014094 x = PyFloat_AsDouble(v);
14095 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014096 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014097
Victor Stinnera47082312012-10-04 02:19:54 +020014098 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014099 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014100 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014101
Victor Stinnera47082312012-10-04 02:19:54 +020014102 if (arg->flags & F_ALT)
14103 dtoa_flags = Py_DTSF_ALT;
14104 else
14105 dtoa_flags = 0;
14106 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014107 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014108 return -1;
14109 len = strlen(p);
14110 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014111 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014112 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014113 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014114 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014115 }
14116 else
14117 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014118 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014119 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014120}
14121
Victor Stinnerd0880d52012-04-27 23:40:13 +020014122/* formatlong() emulates the format codes d, u, o, x and X, and
14123 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14124 * Python's regular ints.
14125 * Return value: a new PyUnicodeObject*, or NULL if error.
14126 * The output string is of the form
14127 * "-"? ("0x" | "0X")? digit+
14128 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14129 * set in flags. The case of hex digits will be correct,
14130 * There will be at least prec digits, zero-filled on the left if
14131 * necessary to get that many.
14132 * val object to be converted
14133 * flags bitmask of format flags; only F_ALT is looked at
14134 * prec minimum number of digits; 0-fill on left if needed
14135 * type a character in [duoxX]; u acts the same as d
14136 *
14137 * CAUTION: o, x and X conversions on regular ints can never
14138 * produce a '-' sign, but can for Python's unbounded ints.
14139 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014140PyObject *
14141_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014142{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014143 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014144 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014145 Py_ssize_t i;
14146 int sign; /* 1 if '-', else 0 */
14147 int len; /* number of characters */
14148 Py_ssize_t llen;
14149 int numdigits; /* len == numnondigits + numdigits */
14150 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014151
Victor Stinnerd0880d52012-04-27 23:40:13 +020014152 /* Avoid exceeding SSIZE_T_MAX */
14153 if (prec > INT_MAX-3) {
14154 PyErr_SetString(PyExc_OverflowError,
14155 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014156 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014157 }
14158
14159 assert(PyLong_Check(val));
14160
14161 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014162 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014163 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014164 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014165 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014166 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014167 /* int and int subclasses should print numerically when a numeric */
14168 /* format code is used (see issue18780) */
14169 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014170 break;
14171 case 'o':
14172 numnondigits = 2;
14173 result = PyNumber_ToBase(val, 8);
14174 break;
14175 case 'x':
14176 case 'X':
14177 numnondigits = 2;
14178 result = PyNumber_ToBase(val, 16);
14179 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014180 }
14181 if (!result)
14182 return NULL;
14183
14184 assert(unicode_modifiable(result));
14185 assert(PyUnicode_IS_READY(result));
14186 assert(PyUnicode_IS_ASCII(result));
14187
14188 /* To modify the string in-place, there can only be one reference. */
14189 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014190 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014191 PyErr_BadInternalCall();
14192 return NULL;
14193 }
14194 buf = PyUnicode_DATA(result);
14195 llen = PyUnicode_GET_LENGTH(result);
14196 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014197 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014198 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014199 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014200 return NULL;
14201 }
14202 len = (int)llen;
14203 sign = buf[0] == '-';
14204 numnondigits += sign;
14205 numdigits = len - numnondigits;
14206 assert(numdigits > 0);
14207
14208 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014209 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014210 (type == 'o' || type == 'x' || type == 'X'))) {
14211 assert(buf[sign] == '0');
14212 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14213 buf[sign+1] == 'o');
14214 numnondigits -= 2;
14215 buf += 2;
14216 len -= 2;
14217 if (sign)
14218 buf[0] = '-';
14219 assert(len == numnondigits + numdigits);
14220 assert(numdigits > 0);
14221 }
14222
14223 /* Fill with leading zeroes to meet minimum width. */
14224 if (prec > numdigits) {
14225 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14226 numnondigits + prec);
14227 char *b1;
14228 if (!r1) {
14229 Py_DECREF(result);
14230 return NULL;
14231 }
14232 b1 = PyBytes_AS_STRING(r1);
14233 for (i = 0; i < numnondigits; ++i)
14234 *b1++ = *buf++;
14235 for (i = 0; i < prec - numdigits; i++)
14236 *b1++ = '0';
14237 for (i = 0; i < numdigits; i++)
14238 *b1++ = *buf++;
14239 *b1 = '\0';
14240 Py_DECREF(result);
14241 result = r1;
14242 buf = PyBytes_AS_STRING(result);
14243 len = numnondigits + prec;
14244 }
14245
14246 /* Fix up case for hex conversions. */
14247 if (type == 'X') {
14248 /* Need to convert all lower case letters to upper case.
14249 and need to convert 0x to 0X (and -0x to -0X). */
14250 for (i = 0; i < len; i++)
14251 if (buf[i] >= 'a' && buf[i] <= 'x')
14252 buf[i] -= 'a'-'A';
14253 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014254 if (!PyUnicode_Check(result)
14255 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014256 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014257 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014258 Py_DECREF(result);
14259 result = unicode;
14260 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014261 else if (len != PyUnicode_GET_LENGTH(result)) {
14262 if (PyUnicode_Resize(&result, len) < 0)
14263 Py_CLEAR(result);
14264 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014265 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014266}
14267
Ethan Furmandf3ed242014-01-05 06:50:30 -080014268/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014269 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014270 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014271 * -1 and raise an exception on error */
14272static int
Victor Stinnera47082312012-10-04 02:19:54 +020014273mainformatlong(PyObject *v,
14274 struct unicode_format_arg_t *arg,
14275 PyObject **p_output,
14276 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014277{
14278 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014279 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014280
14281 if (!PyNumber_Check(v))
14282 goto wrongtype;
14283
Ethan Furman9ab74802014-03-21 06:38:46 -070014284 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014285 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014286 if (type == 'o' || type == 'x' || type == 'X') {
14287 iobj = PyNumber_Index(v);
14288 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014289 if (PyErr_ExceptionMatches(PyExc_TypeError))
14290 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014291 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014292 }
14293 }
14294 else {
14295 iobj = PyNumber_Long(v);
14296 if (iobj == NULL ) {
14297 if (PyErr_ExceptionMatches(PyExc_TypeError))
14298 goto wrongtype;
14299 return -1;
14300 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014301 }
14302 assert(PyLong_Check(iobj));
14303 }
14304 else {
14305 iobj = v;
14306 Py_INCREF(iobj);
14307 }
14308
14309 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014310 && arg->width == -1 && arg->prec == -1
14311 && !(arg->flags & (F_SIGN | F_BLANK))
14312 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014313 {
14314 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014315 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014316 int base;
14317
Victor Stinnera47082312012-10-04 02:19:54 +020014318 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014319 {
14320 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014321 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014322 case 'd':
14323 case 'i':
14324 case 'u':
14325 base = 10;
14326 break;
14327 case 'o':
14328 base = 8;
14329 break;
14330 case 'x':
14331 case 'X':
14332 base = 16;
14333 break;
14334 }
14335
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014336 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14337 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014338 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014339 }
14340 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014341 return 1;
14342 }
14343
Ethan Furmanb95b5612015-01-23 20:05:18 -080014344 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014345 Py_DECREF(iobj);
14346 if (res == NULL)
14347 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014348 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014349 return 0;
14350
14351wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014352 switch(type)
14353 {
14354 case 'o':
14355 case 'x':
14356 case 'X':
14357 PyErr_Format(PyExc_TypeError,
14358 "%%%c format: an integer is required, "
14359 "not %.200s",
14360 type, Py_TYPE(v)->tp_name);
14361 break;
14362 default:
14363 PyErr_Format(PyExc_TypeError,
14364 "%%%c format: a number is required, "
14365 "not %.200s",
14366 type, Py_TYPE(v)->tp_name);
14367 break;
14368 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014369 return -1;
14370}
14371
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014372static Py_UCS4
14373formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014374{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014375 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014376 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014377 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014378 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014379 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014380 goto onError;
14381 }
14382 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014383 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014384 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014385 /* make sure number is a type of integer */
14386 if (!PyLong_Check(v)) {
14387 iobj = PyNumber_Index(v);
14388 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014389 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014390 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014391 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014392 Py_DECREF(iobj);
14393 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014394 else {
14395 x = PyLong_AsLong(v);
14396 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014397 if (x == -1 && PyErr_Occurred())
14398 goto onError;
14399
Victor Stinner8faf8212011-12-08 22:14:11 +010014400 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014401 PyErr_SetString(PyExc_OverflowError,
14402 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014403 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014404 }
14405
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014406 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014407 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014408
Benjamin Peterson29060642009-01-31 22:14:21 +000014409 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014410 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014411 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014412 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014413}
14414
Victor Stinnera47082312012-10-04 02:19:54 +020014415/* Parse options of an argument: flags, width, precision.
14416 Handle also "%(name)" syntax.
14417
14418 Return 0 if the argument has been formatted into arg->str.
14419 Return 1 if the argument has been written into ctx->writer,
14420 Raise an exception and return -1 on error. */
14421static int
14422unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14423 struct unicode_format_arg_t *arg)
14424{
14425#define FORMAT_READ(ctx) \
14426 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14427
14428 PyObject *v;
14429
Victor Stinnera47082312012-10-04 02:19:54 +020014430 if (arg->ch == '(') {
14431 /* Get argument value from a dictionary. Example: "%(name)s". */
14432 Py_ssize_t keystart;
14433 Py_ssize_t keylen;
14434 PyObject *key;
14435 int pcount = 1;
14436
14437 if (ctx->dict == NULL) {
14438 PyErr_SetString(PyExc_TypeError,
14439 "format requires a mapping");
14440 return -1;
14441 }
14442 ++ctx->fmtpos;
14443 --ctx->fmtcnt;
14444 keystart = ctx->fmtpos;
14445 /* Skip over balanced parentheses */
14446 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14447 arg->ch = FORMAT_READ(ctx);
14448 if (arg->ch == ')')
14449 --pcount;
14450 else if (arg->ch == '(')
14451 ++pcount;
14452 ctx->fmtpos++;
14453 }
14454 keylen = ctx->fmtpos - keystart - 1;
14455 if (ctx->fmtcnt < 0 || pcount > 0) {
14456 PyErr_SetString(PyExc_ValueError,
14457 "incomplete format key");
14458 return -1;
14459 }
14460 key = PyUnicode_Substring(ctx->fmtstr,
14461 keystart, keystart + keylen);
14462 if (key == NULL)
14463 return -1;
14464 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014465 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014466 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014467 }
14468 ctx->args = PyObject_GetItem(ctx->dict, key);
14469 Py_DECREF(key);
14470 if (ctx->args == NULL)
14471 return -1;
14472 ctx->args_owned = 1;
14473 ctx->arglen = -1;
14474 ctx->argidx = -2;
14475 }
14476
14477 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014478 while (--ctx->fmtcnt >= 0) {
14479 arg->ch = FORMAT_READ(ctx);
14480 ctx->fmtpos++;
14481 switch (arg->ch) {
14482 case '-': arg->flags |= F_LJUST; continue;
14483 case '+': arg->flags |= F_SIGN; continue;
14484 case ' ': arg->flags |= F_BLANK; continue;
14485 case '#': arg->flags |= F_ALT; continue;
14486 case '0': arg->flags |= F_ZERO; continue;
14487 }
14488 break;
14489 }
14490
14491 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014492 if (arg->ch == '*') {
14493 v = unicode_format_getnextarg(ctx);
14494 if (v == NULL)
14495 return -1;
14496 if (!PyLong_Check(v)) {
14497 PyErr_SetString(PyExc_TypeError,
14498 "* wants int");
14499 return -1;
14500 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014501 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014502 if (arg->width == -1 && PyErr_Occurred())
14503 return -1;
14504 if (arg->width < 0) {
14505 arg->flags |= F_LJUST;
14506 arg->width = -arg->width;
14507 }
14508 if (--ctx->fmtcnt >= 0) {
14509 arg->ch = FORMAT_READ(ctx);
14510 ctx->fmtpos++;
14511 }
14512 }
14513 else if (arg->ch >= '0' && arg->ch <= '9') {
14514 arg->width = arg->ch - '0';
14515 while (--ctx->fmtcnt >= 0) {
14516 arg->ch = FORMAT_READ(ctx);
14517 ctx->fmtpos++;
14518 if (arg->ch < '0' || arg->ch > '9')
14519 break;
14520 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14521 mixing signed and unsigned comparison. Since arg->ch is between
14522 '0' and '9', casting to int is safe. */
14523 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14524 PyErr_SetString(PyExc_ValueError,
14525 "width too big");
14526 return -1;
14527 }
14528 arg->width = arg->width*10 + (arg->ch - '0');
14529 }
14530 }
14531
14532 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014533 if (arg->ch == '.') {
14534 arg->prec = 0;
14535 if (--ctx->fmtcnt >= 0) {
14536 arg->ch = FORMAT_READ(ctx);
14537 ctx->fmtpos++;
14538 }
14539 if (arg->ch == '*') {
14540 v = unicode_format_getnextarg(ctx);
14541 if (v == NULL)
14542 return -1;
14543 if (!PyLong_Check(v)) {
14544 PyErr_SetString(PyExc_TypeError,
14545 "* wants int");
14546 return -1;
14547 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014548 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014549 if (arg->prec == -1 && PyErr_Occurred())
14550 return -1;
14551 if (arg->prec < 0)
14552 arg->prec = 0;
14553 if (--ctx->fmtcnt >= 0) {
14554 arg->ch = FORMAT_READ(ctx);
14555 ctx->fmtpos++;
14556 }
14557 }
14558 else if (arg->ch >= '0' && arg->ch <= '9') {
14559 arg->prec = arg->ch - '0';
14560 while (--ctx->fmtcnt >= 0) {
14561 arg->ch = FORMAT_READ(ctx);
14562 ctx->fmtpos++;
14563 if (arg->ch < '0' || arg->ch > '9')
14564 break;
14565 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14566 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014567 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014568 return -1;
14569 }
14570 arg->prec = arg->prec*10 + (arg->ch - '0');
14571 }
14572 }
14573 }
14574
14575 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14576 if (ctx->fmtcnt >= 0) {
14577 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14578 if (--ctx->fmtcnt >= 0) {
14579 arg->ch = FORMAT_READ(ctx);
14580 ctx->fmtpos++;
14581 }
14582 }
14583 }
14584 if (ctx->fmtcnt < 0) {
14585 PyErr_SetString(PyExc_ValueError,
14586 "incomplete format");
14587 return -1;
14588 }
14589 return 0;
14590
14591#undef FORMAT_READ
14592}
14593
14594/* Format one argument. Supported conversion specifiers:
14595
14596 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014597 - "i", "d", "u": int or float
14598 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014599 - "e", "E", "f", "F", "g", "G": float
14600 - "c": int or str (1 character)
14601
Victor Stinner8dbd4212012-12-04 09:30:24 +010014602 When possible, the output is written directly into the Unicode writer
14603 (ctx->writer). A string is created when padding is required.
14604
Victor Stinnera47082312012-10-04 02:19:54 +020014605 Return 0 if the argument has been formatted into *p_str,
14606 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014607 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014608static int
14609unicode_format_arg_format(struct unicode_formatter_t *ctx,
14610 struct unicode_format_arg_t *arg,
14611 PyObject **p_str)
14612{
14613 PyObject *v;
14614 _PyUnicodeWriter *writer = &ctx->writer;
14615
14616 if (ctx->fmtcnt == 0)
14617 ctx->writer.overallocate = 0;
14618
Victor Stinnera47082312012-10-04 02:19:54 +020014619 v = unicode_format_getnextarg(ctx);
14620 if (v == NULL)
14621 return -1;
14622
Victor Stinnera47082312012-10-04 02:19:54 +020014623
14624 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014625 case 's':
14626 case 'r':
14627 case 'a':
14628 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14629 /* Fast path */
14630 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14631 return -1;
14632 return 1;
14633 }
14634
14635 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14636 *p_str = v;
14637 Py_INCREF(*p_str);
14638 }
14639 else {
14640 if (arg->ch == 's')
14641 *p_str = PyObject_Str(v);
14642 else if (arg->ch == 'r')
14643 *p_str = PyObject_Repr(v);
14644 else
14645 *p_str = PyObject_ASCII(v);
14646 }
14647 break;
14648
14649 case 'i':
14650 case 'd':
14651 case 'u':
14652 case 'o':
14653 case 'x':
14654 case 'X':
14655 {
14656 int ret = mainformatlong(v, arg, p_str, writer);
14657 if (ret != 0)
14658 return ret;
14659 arg->sign = 1;
14660 break;
14661 }
14662
14663 case 'e':
14664 case 'E':
14665 case 'f':
14666 case 'F':
14667 case 'g':
14668 case 'G':
14669 if (arg->width == -1 && arg->prec == -1
14670 && !(arg->flags & (F_SIGN | F_BLANK)))
14671 {
14672 /* Fast path */
14673 if (formatfloat(v, arg, NULL, writer) == -1)
14674 return -1;
14675 return 1;
14676 }
14677
14678 arg->sign = 1;
14679 if (formatfloat(v, arg, p_str, NULL) == -1)
14680 return -1;
14681 break;
14682
14683 case 'c':
14684 {
14685 Py_UCS4 ch = formatchar(v);
14686 if (ch == (Py_UCS4) -1)
14687 return -1;
14688 if (arg->width == -1 && arg->prec == -1) {
14689 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014690 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014691 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014692 return 1;
14693 }
14694 *p_str = PyUnicode_FromOrdinal(ch);
14695 break;
14696 }
14697
14698 default:
14699 PyErr_Format(PyExc_ValueError,
14700 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014701 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014702 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14703 (int)arg->ch,
14704 ctx->fmtpos - 1);
14705 return -1;
14706 }
14707 if (*p_str == NULL)
14708 return -1;
14709 assert (PyUnicode_Check(*p_str));
14710 return 0;
14711}
14712
14713static int
14714unicode_format_arg_output(struct unicode_formatter_t *ctx,
14715 struct unicode_format_arg_t *arg,
14716 PyObject *str)
14717{
14718 Py_ssize_t len;
14719 enum PyUnicode_Kind kind;
14720 void *pbuf;
14721 Py_ssize_t pindex;
14722 Py_UCS4 signchar;
14723 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014724 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014725 Py_ssize_t sublen;
14726 _PyUnicodeWriter *writer = &ctx->writer;
14727 Py_UCS4 fill;
14728
14729 fill = ' ';
14730 if (arg->sign && arg->flags & F_ZERO)
14731 fill = '0';
14732
14733 if (PyUnicode_READY(str) == -1)
14734 return -1;
14735
14736 len = PyUnicode_GET_LENGTH(str);
14737 if ((arg->width == -1 || arg->width <= len)
14738 && (arg->prec == -1 || arg->prec >= len)
14739 && !(arg->flags & (F_SIGN | F_BLANK)))
14740 {
14741 /* Fast path */
14742 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14743 return -1;
14744 return 0;
14745 }
14746
14747 /* Truncate the string for "s", "r" and "a" formats
14748 if the precision is set */
14749 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14750 if (arg->prec >= 0 && len > arg->prec)
14751 len = arg->prec;
14752 }
14753
14754 /* Adjust sign and width */
14755 kind = PyUnicode_KIND(str);
14756 pbuf = PyUnicode_DATA(str);
14757 pindex = 0;
14758 signchar = '\0';
14759 if (arg->sign) {
14760 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14761 if (ch == '-' || ch == '+') {
14762 signchar = ch;
14763 len--;
14764 pindex++;
14765 }
14766 else if (arg->flags & F_SIGN)
14767 signchar = '+';
14768 else if (arg->flags & F_BLANK)
14769 signchar = ' ';
14770 else
14771 arg->sign = 0;
14772 }
14773 if (arg->width < len)
14774 arg->width = len;
14775
14776 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014777 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014778 if (!(arg->flags & F_LJUST)) {
14779 if (arg->sign) {
14780 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014781 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014782 }
14783 else {
14784 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014785 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014786 }
14787 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014788 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14789 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014790 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014791 }
14792
Victor Stinnera47082312012-10-04 02:19:54 +020014793 buflen = arg->width;
14794 if (arg->sign && len == arg->width)
14795 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014796 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014797 return -1;
14798
14799 /* Write the sign if needed */
14800 if (arg->sign) {
14801 if (fill != ' ') {
14802 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14803 writer->pos += 1;
14804 }
14805 if (arg->width > len)
14806 arg->width--;
14807 }
14808
14809 /* Write the numeric prefix for "x", "X" and "o" formats
14810 if the alternate form is used.
14811 For example, write "0x" for the "%#x" format. */
14812 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14813 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14814 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14815 if (fill != ' ') {
14816 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14817 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14818 writer->pos += 2;
14819 pindex += 2;
14820 }
14821 arg->width -= 2;
14822 if (arg->width < 0)
14823 arg->width = 0;
14824 len -= 2;
14825 }
14826
14827 /* Pad left with the fill character if needed */
14828 if (arg->width > len && !(arg->flags & F_LJUST)) {
14829 sublen = arg->width - len;
14830 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14831 writer->pos += sublen;
14832 arg->width = len;
14833 }
14834
14835 /* If padding with spaces: write sign if needed and/or numeric prefix if
14836 the alternate form is used */
14837 if (fill == ' ') {
14838 if (arg->sign) {
14839 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14840 writer->pos += 1;
14841 }
14842 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14843 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14844 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14845 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14846 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14847 writer->pos += 2;
14848 pindex += 2;
14849 }
14850 }
14851
14852 /* Write characters */
14853 if (len) {
14854 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14855 str, pindex, len);
14856 writer->pos += len;
14857 }
14858
14859 /* Pad right with the fill character if needed */
14860 if (arg->width > len) {
14861 sublen = arg->width - len;
14862 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14863 writer->pos += sublen;
14864 }
14865 return 0;
14866}
14867
14868/* Helper of PyUnicode_Format(): format one arg.
14869 Return 0 on success, raise an exception and return -1 on error. */
14870static int
14871unicode_format_arg(struct unicode_formatter_t *ctx)
14872{
14873 struct unicode_format_arg_t arg;
14874 PyObject *str;
14875 int ret;
14876
Victor Stinner8dbd4212012-12-04 09:30:24 +010014877 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014878 if (arg.ch == '%') {
14879 ctx->fmtpos++;
14880 ctx->fmtcnt--;
14881 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14882 return -1;
14883 return 0;
14884 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014885 arg.flags = 0;
14886 arg.width = -1;
14887 arg.prec = -1;
14888 arg.sign = 0;
14889 str = NULL;
14890
Victor Stinnera47082312012-10-04 02:19:54 +020014891 ret = unicode_format_arg_parse(ctx, &arg);
14892 if (ret == -1)
14893 return -1;
14894
14895 ret = unicode_format_arg_format(ctx, &arg, &str);
14896 if (ret == -1)
14897 return -1;
14898
14899 if (ret != 1) {
14900 ret = unicode_format_arg_output(ctx, &arg, str);
14901 Py_DECREF(str);
14902 if (ret == -1)
14903 return -1;
14904 }
14905
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014906 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014907 PyErr_SetString(PyExc_TypeError,
14908 "not all arguments converted during string formatting");
14909 return -1;
14910 }
14911 return 0;
14912}
14913
Alexander Belopolsky40018472011-02-26 01:02:56 +000014914PyObject *
14915PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014916{
Victor Stinnera47082312012-10-04 02:19:54 +020014917 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014918
Guido van Rossumd57fd912000-03-10 22:53:23 +000014919 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014920 PyErr_BadInternalCall();
14921 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014922 }
Victor Stinnera47082312012-10-04 02:19:54 +020014923
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014924 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014925 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014926
14927 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014928 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14929 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14930 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14931 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014932
Victor Stinner8f674cc2013-04-17 23:02:17 +020014933 _PyUnicodeWriter_Init(&ctx.writer);
14934 ctx.writer.min_length = ctx.fmtcnt + 100;
14935 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014936
Guido van Rossumd57fd912000-03-10 22:53:23 +000014937 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014938 ctx.arglen = PyTuple_Size(args);
14939 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014940 }
14941 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014942 ctx.arglen = -1;
14943 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014944 }
Victor Stinnera47082312012-10-04 02:19:54 +020014945 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014946 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014947 ctx.dict = args;
14948 else
14949 ctx.dict = NULL;
14950 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014951
Victor Stinnera47082312012-10-04 02:19:54 +020014952 while (--ctx.fmtcnt >= 0) {
14953 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014954 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014955
14956 nonfmtpos = ctx.fmtpos++;
14957 while (ctx.fmtcnt >= 0 &&
14958 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14959 ctx.fmtpos++;
14960 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014961 }
Victor Stinnera47082312012-10-04 02:19:54 +020014962 if (ctx.fmtcnt < 0) {
14963 ctx.fmtpos--;
14964 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014965 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014966
Victor Stinnercfc4c132013-04-03 01:48:39 +020014967 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14968 nonfmtpos, ctx.fmtpos) < 0)
14969 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014970 }
14971 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014972 ctx.fmtpos++;
14973 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014974 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014975 }
14976 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014977
Victor Stinnera47082312012-10-04 02:19:54 +020014978 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014979 PyErr_SetString(PyExc_TypeError,
14980 "not all arguments converted during string formatting");
14981 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014982 }
14983
Victor Stinnera47082312012-10-04 02:19:54 +020014984 if (ctx.args_owned) {
14985 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014986 }
Victor Stinnera47082312012-10-04 02:19:54 +020014987 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014988
Benjamin Peterson29060642009-01-31 22:14:21 +000014989 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014990 _PyUnicodeWriter_Dealloc(&ctx.writer);
14991 if (ctx.args_owned) {
14992 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014993 }
14994 return NULL;
14995}
14996
Jeremy Hylton938ace62002-07-17 16:30:39 +000014997static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014998unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14999
Tim Peters6d6c1a32001-08-02 04:15:00 +000015000static PyObject *
15001unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15002{
Benjamin Peterson29060642009-01-31 22:14:21 +000015003 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015004 static char *kwlist[] = {"object", "encoding", "errors", 0};
15005 char *encoding = NULL;
15006 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015007
Benjamin Peterson14339b62009-01-31 16:36:08 +000015008 if (type != &PyUnicode_Type)
15009 return unicode_subtype_new(type, args, kwds);
15010 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015011 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015012 return NULL;
15013 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015014 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015015 if (encoding == NULL && errors == NULL)
15016 return PyObject_Str(x);
15017 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015018 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015019}
15020
Guido van Rossume023fe02001-08-30 03:12:59 +000015021static PyObject *
15022unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15023{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015024 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015025 Py_ssize_t length, char_size;
15026 int share_wstr, share_utf8;
15027 unsigned int kind;
15028 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015029
Benjamin Peterson14339b62009-01-31 16:36:08 +000015030 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015031
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015032 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015033 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015034 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015035 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015036 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015037 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015038 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015039 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015040
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015041 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015042 if (self == NULL) {
15043 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015044 return NULL;
15045 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015046 kind = PyUnicode_KIND(unicode);
15047 length = PyUnicode_GET_LENGTH(unicode);
15048
15049 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015050#ifdef Py_DEBUG
15051 _PyUnicode_HASH(self) = -1;
15052#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015053 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015054#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015055 _PyUnicode_STATE(self).interned = 0;
15056 _PyUnicode_STATE(self).kind = kind;
15057 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015058 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015059 _PyUnicode_STATE(self).ready = 1;
15060 _PyUnicode_WSTR(self) = NULL;
15061 _PyUnicode_UTF8_LENGTH(self) = 0;
15062 _PyUnicode_UTF8(self) = NULL;
15063 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015064 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015065
15066 share_utf8 = 0;
15067 share_wstr = 0;
15068 if (kind == PyUnicode_1BYTE_KIND) {
15069 char_size = 1;
15070 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15071 share_utf8 = 1;
15072 }
15073 else if (kind == PyUnicode_2BYTE_KIND) {
15074 char_size = 2;
15075 if (sizeof(wchar_t) == 2)
15076 share_wstr = 1;
15077 }
15078 else {
15079 assert(kind == PyUnicode_4BYTE_KIND);
15080 char_size = 4;
15081 if (sizeof(wchar_t) == 4)
15082 share_wstr = 1;
15083 }
15084
15085 /* Ensure we won't overflow the length. */
15086 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15087 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015088 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015089 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015090 data = PyObject_MALLOC((length + 1) * char_size);
15091 if (data == NULL) {
15092 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015093 goto onError;
15094 }
15095
Victor Stinnerc3c74152011-10-02 20:39:55 +020015096 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015097 if (share_utf8) {
15098 _PyUnicode_UTF8_LENGTH(self) = length;
15099 _PyUnicode_UTF8(self) = data;
15100 }
15101 if (share_wstr) {
15102 _PyUnicode_WSTR_LENGTH(self) = length;
15103 _PyUnicode_WSTR(self) = (wchar_t *)data;
15104 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015105
Christian Heimesf051e432016-09-13 20:22:02 +020015106 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015107 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015108 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015109#ifdef Py_DEBUG
15110 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15111#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015112 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015113 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015114
15115onError:
15116 Py_DECREF(unicode);
15117 Py_DECREF(self);
15118 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015119}
15120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015121PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015122"str(object='') -> str\n\
15123str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015124\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015125Create a new string object from the given object. If encoding or\n\
15126errors is specified, then the object must expose a data buffer\n\
15127that will be decoded using the given encoding and error handler.\n\
15128Otherwise, returns the result of object.__str__() (if defined)\n\
15129or repr(object).\n\
15130encoding defaults to sys.getdefaultencoding().\n\
15131errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015132
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015133static PyObject *unicode_iter(PyObject *seq);
15134
Guido van Rossumd57fd912000-03-10 22:53:23 +000015135PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015136 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015137 "str", /* tp_name */
15138 sizeof(PyUnicodeObject), /* tp_size */
15139 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015140 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015141 (destructor)unicode_dealloc, /* tp_dealloc */
15142 0, /* tp_print */
15143 0, /* tp_getattr */
15144 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015145 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015146 unicode_repr, /* tp_repr */
15147 &unicode_as_number, /* tp_as_number */
15148 &unicode_as_sequence, /* tp_as_sequence */
15149 &unicode_as_mapping, /* tp_as_mapping */
15150 (hashfunc) unicode_hash, /* tp_hash*/
15151 0, /* tp_call*/
15152 (reprfunc) unicode_str, /* tp_str */
15153 PyObject_GenericGetAttr, /* tp_getattro */
15154 0, /* tp_setattro */
15155 0, /* tp_as_buffer */
15156 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015157 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015158 unicode_doc, /* tp_doc */
15159 0, /* tp_traverse */
15160 0, /* tp_clear */
15161 PyUnicode_RichCompare, /* tp_richcompare */
15162 0, /* tp_weaklistoffset */
15163 unicode_iter, /* tp_iter */
15164 0, /* tp_iternext */
15165 unicode_methods, /* tp_methods */
15166 0, /* tp_members */
15167 0, /* tp_getset */
15168 &PyBaseObject_Type, /* tp_base */
15169 0, /* tp_dict */
15170 0, /* tp_descr_get */
15171 0, /* tp_descr_set */
15172 0, /* tp_dictoffset */
15173 0, /* tp_init */
15174 0, /* tp_alloc */
15175 unicode_new, /* tp_new */
15176 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015177};
15178
15179/* Initialize the Unicode implementation */
15180
Victor Stinner3a50e702011-10-18 21:21:00 +020015181int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015182{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015183 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015184 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015185 0x000A, /* LINE FEED */
15186 0x000D, /* CARRIAGE RETURN */
15187 0x001C, /* FILE SEPARATOR */
15188 0x001D, /* GROUP SEPARATOR */
15189 0x001E, /* RECORD SEPARATOR */
15190 0x0085, /* NEXT LINE */
15191 0x2028, /* LINE SEPARATOR */
15192 0x2029, /* PARAGRAPH SEPARATOR */
15193 };
15194
Fred Drakee4315f52000-05-09 19:53:39 +000015195 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015196 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015197 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015198 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015199 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015200
Guido van Rossumcacfc072002-05-24 19:01:59 +000015201 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015202 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015203
15204 /* initialize the linebreak bloom filter */
15205 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015206 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015207 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015208
Christian Heimes26532f72013-07-20 14:57:16 +020015209 if (PyType_Ready(&EncodingMapType) < 0)
15210 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015211
Benjamin Petersonc4311282012-10-30 23:21:10 -040015212 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15213 Py_FatalError("Can't initialize field name iterator type");
15214
15215 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15216 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015217
Victor Stinner3a50e702011-10-18 21:21:00 +020015218 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015219}
15220
15221/* Finalize the Unicode implementation */
15222
/* Does nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15228
Guido van Rossumd57fd912000-03-10 22:53:23 +000015229void
Thomas Wouters78890102000-07-22 19:25:51 +000015230_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015231{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015232 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015233
Serhiy Storchaka05997252013-01-26 12:14:02 +020015234 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015235
Serhiy Storchaka05997252013-01-26 12:14:02 +020015236 for (i = 0; i < 256; i++)
15237 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015238 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015239 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015240}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015241
Walter Dörwald16807132007-05-25 13:52:07 +000015242void
15243PyUnicode_InternInPlace(PyObject **p)
15244{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015245 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015246 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015247#ifdef Py_DEBUG
15248 assert(s != NULL);
15249 assert(_PyUnicode_CHECK(s));
15250#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015251 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015252 return;
15253#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015254 /* If it's a subclass, we don't really know what putting
15255 it in the interned dict might do. */
15256 if (!PyUnicode_CheckExact(s))
15257 return;
15258 if (PyUnicode_CHECK_INTERNED(s))
15259 return;
15260 if (interned == NULL) {
15261 interned = PyDict_New();
15262 if (interned == NULL) {
15263 PyErr_Clear(); /* Don't leave an exception */
15264 return;
15265 }
15266 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015267 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015268 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015269 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015270 if (t == NULL) {
15271 PyErr_Clear();
15272 return;
15273 }
15274 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015275 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015276 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015277 return;
15278 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015279 /* The two references in interned are not counted by refcnt.
15280 The deallocator will take care of this */
15281 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015282 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015283}
15284
15285void
15286PyUnicode_InternImmortal(PyObject **p)
15287{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015288 PyUnicode_InternInPlace(p);
15289 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015290 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015291 Py_INCREF(*p);
15292 }
Walter Dörwald16807132007-05-25 13:52:07 +000015293}
15294
15295PyObject *
15296PyUnicode_InternFromString(const char *cp)
15297{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015298 PyObject *s = PyUnicode_FromString(cp);
15299 if (s == NULL)
15300 return NULL;
15301 PyUnicode_InternInPlace(&s);
15302 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015303}
15304
Alexander Belopolsky40018472011-02-26 01:02:56 +000015305void
15306_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015307{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015308 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015309 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015310 Py_ssize_t i, n;
15311 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015312
Benjamin Peterson14339b62009-01-31 16:36:08 +000015313 if (interned == NULL || !PyDict_Check(interned))
15314 return;
15315 keys = PyDict_Keys(interned);
15316 if (keys == NULL || !PyList_Check(keys)) {
15317 PyErr_Clear();
15318 return;
15319 }
Walter Dörwald16807132007-05-25 13:52:07 +000015320
Benjamin Peterson14339b62009-01-31 16:36:08 +000015321 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15322 detector, interned unicode strings are not forcibly deallocated;
15323 rather, we give them their stolen references back, and then clear
15324 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015325
Benjamin Peterson14339b62009-01-31 16:36:08 +000015326 n = PyList_GET_SIZE(keys);
15327 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015328 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015329 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015330 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015331 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015332 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015333 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015334 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015335 case SSTATE_NOT_INTERNED:
15336 /* XXX Shouldn't happen */
15337 break;
15338 case SSTATE_INTERNED_IMMORTAL:
15339 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015340 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015341 break;
15342 case SSTATE_INTERNED_MORTAL:
15343 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015344 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015345 break;
15346 default:
15347 Py_FatalError("Inconsistent interned string state.");
15348 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015349 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015350 }
15351 fprintf(stderr, "total size of all interned strings: "
15352 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15353 "mortal/immortal\n", mortal_size, immortal_size);
15354 Py_DECREF(keys);
15355 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015356 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015357}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015358
15359
15360/********************* Unicode Iterator **************************/
15361
15362typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015363 PyObject_HEAD
15364 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015365 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015366} unicodeiterobject;
15367
15368static void
15369unicodeiter_dealloc(unicodeiterobject *it)
15370{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015371 _PyObject_GC_UNTRACK(it);
15372 Py_XDECREF(it->it_seq);
15373 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015374}
15375
15376static int
15377unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15378{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015379 Py_VISIT(it->it_seq);
15380 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015381}
15382
15383static PyObject *
15384unicodeiter_next(unicodeiterobject *it)
15385{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015386 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015387
Benjamin Peterson14339b62009-01-31 16:36:08 +000015388 assert(it != NULL);
15389 seq = it->it_seq;
15390 if (seq == NULL)
15391 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015392 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015394 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15395 int kind = PyUnicode_KIND(seq);
15396 void *data = PyUnicode_DATA(seq);
15397 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15398 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015399 if (item != NULL)
15400 ++it->it_index;
15401 return item;
15402 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015403
Benjamin Peterson14339b62009-01-31 16:36:08 +000015404 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015405 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015406 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015407}
15408
15409static PyObject *
15410unicodeiter_len(unicodeiterobject *it)
15411{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015412 Py_ssize_t len = 0;
15413 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015414 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015415 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015416}
15417
15418PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15419
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015420static PyObject *
15421unicodeiter_reduce(unicodeiterobject *it)
15422{
15423 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015424 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015425 it->it_seq, it->it_index);
15426 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015427 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015428 if (u == NULL)
15429 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015430 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015431 }
15432}
15433
15434PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15435
15436static PyObject *
15437unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15438{
15439 Py_ssize_t index = PyLong_AsSsize_t(state);
15440 if (index == -1 && PyErr_Occurred())
15441 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015442 if (it->it_seq != NULL) {
15443 if (index < 0)
15444 index = 0;
15445 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15446 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15447 it->it_index = index;
15448 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015449 Py_RETURN_NONE;
15450}
15451
15452PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15453
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015454static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015455 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015456 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015457 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15458 reduce_doc},
15459 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15460 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015461 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015462};
15463
15464PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015465 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15466 "str_iterator", /* tp_name */
15467 sizeof(unicodeiterobject), /* tp_basicsize */
15468 0, /* tp_itemsize */
15469 /* methods */
15470 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15471 0, /* tp_print */
15472 0, /* tp_getattr */
15473 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015474 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015475 0, /* tp_repr */
15476 0, /* tp_as_number */
15477 0, /* tp_as_sequence */
15478 0, /* tp_as_mapping */
15479 0, /* tp_hash */
15480 0, /* tp_call */
15481 0, /* tp_str */
15482 PyObject_GenericGetAttr, /* tp_getattro */
15483 0, /* tp_setattro */
15484 0, /* tp_as_buffer */
15485 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15486 0, /* tp_doc */
15487 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15488 0, /* tp_clear */
15489 0, /* tp_richcompare */
15490 0, /* tp_weaklistoffset */
15491 PyObject_SelfIter, /* tp_iter */
15492 (iternextfunc)unicodeiter_next, /* tp_iternext */
15493 unicodeiter_methods, /* tp_methods */
15494 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015495};
15496
15497static PyObject *
15498unicode_iter(PyObject *seq)
15499{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015500 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015501
Benjamin Peterson14339b62009-01-31 16:36:08 +000015502 if (!PyUnicode_Check(seq)) {
15503 PyErr_BadInternalCall();
15504 return NULL;
15505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015506 if (PyUnicode_READY(seq) == -1)
15507 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015508 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15509 if (it == NULL)
15510 return NULL;
15511 it->it_index = 0;
15512 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015513 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015514 _PyObject_GC_TRACK(it);
15515 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015516}
15517
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015518
15519size_t
15520Py_UNICODE_strlen(const Py_UNICODE *u)
15521{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015522 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015523}
15524
15525Py_UNICODE*
15526Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15527{
15528 Py_UNICODE *u = s1;
15529 while ((*u++ = *s2++));
15530 return s1;
15531}
15532
15533Py_UNICODE*
15534Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15535{
15536 Py_UNICODE *u = s1;
15537 while ((*u++ = *s2++))
15538 if (n-- == 0)
15539 break;
15540 return s1;
15541}
15542
15543Py_UNICODE*
15544Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15545{
15546 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015547 u1 += wcslen(u1);
15548 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015549 return s1;
15550}
15551
15552int
15553Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15554{
15555 while (*s1 && *s2 && *s1 == *s2)
15556 s1++, s2++;
15557 if (*s1 && *s2)
15558 return (*s1 < *s2) ? -1 : +1;
15559 if (*s1)
15560 return 1;
15561 if (*s2)
15562 return -1;
15563 return 0;
15564}
15565
15566int
15567Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15568{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015569 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015570 for (; n != 0; n--) {
15571 u1 = *s1;
15572 u2 = *s2;
15573 if (u1 != u2)
15574 return (u1 < u2) ? -1 : +1;
15575 if (u1 == '\0')
15576 return 0;
15577 s1++;
15578 s2++;
15579 }
15580 return 0;
15581}
15582
15583Py_UNICODE*
15584Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15585{
15586 const Py_UNICODE *p;
15587 for (p = s; *p; p++)
15588 if (*p == c)
15589 return (Py_UNICODE*)p;
15590 return NULL;
15591}
15592
15593Py_UNICODE*
15594Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15595{
15596 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015597 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015598 while (p != s) {
15599 p--;
15600 if (*p == c)
15601 return (Py_UNICODE*)p;
15602 }
15603 return NULL;
15604}
Victor Stinner331ea922010-08-10 16:37:20 +000015605
Victor Stinner71133ff2010-09-01 23:43:53 +000015606Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015607PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015608{
Victor Stinner577db2c2011-10-11 22:12:48 +020015609 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015610 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015612 if (!PyUnicode_Check(unicode)) {
15613 PyErr_BadArgument();
15614 return NULL;
15615 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015616 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015617 if (u == NULL)
15618 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015619 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015620 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015621 PyErr_NoMemory();
15622 return NULL;
15623 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015624 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015625 size *= sizeof(Py_UNICODE);
15626 copy = PyMem_Malloc(size);
15627 if (copy == NULL) {
15628 PyErr_NoMemory();
15629 return NULL;
15630 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015631 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015632 return copy;
15633}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015634
Georg Brandl66c221e2010-10-14 07:04:07 +000015635/* A _string module, to export formatter_parser and formatter_field_name_split
15636 to the string.Formatter class implemented in Python. */
15637
15638static PyMethodDef _string_methods[] = {
15639 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15640 METH_O, PyDoc_STR("split the argument as a field name")},
15641 {"formatter_parser", (PyCFunction) formatter_parser,
15642 METH_O, PyDoc_STR("parse the argument as a format string")},
15643 {NULL, NULL}
15644};
15645
15646static struct PyModuleDef _string_module = {
15647 PyModuleDef_HEAD_INIT,
15648 "_string",
15649 PyDoc_STR("string helper module"),
15650 0,
15651 _string_methods,
15652 NULL,
15653 NULL,
15654 NULL,
15655 NULL
15656};
15657
15658PyMODINIT_FUNC
15659PyInit__string(void)
15660{
15661 return PyModule_Create(&_string_module);
15662}
15663
15664
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015665#ifdef __cplusplus
15666}
15667#endif