blob: 9991362a333036d8da8e75a3ff704031e2ebfbef [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010044#include "pycore_object.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010045#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050047#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070048#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
56/* #define INTERNED_STATS 1 */
57
58
Larry Hastings61272b72014-01-07 12:41:53 -080059/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090060class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080061[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090062/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
63
64/*[python input]
65class Py_UCS4_converter(CConverter):
66 type = 'Py_UCS4'
67 converter = 'convert_uc'
68
69 def converter_init(self):
70 if self.default is not unspecified:
71 self.c_default = ascii(self.default)
72 if len(self.c_default) > 4 or self.c_default[0] != "'":
73 self.c_default = hex(ord(self.default))
74
75[python start generated code]*/
76/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080077
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000078/* --- Globals ------------------------------------------------------------
79
Serhiy Storchaka05997252013-01-26 12:14:02 +020080NOTE: In the interpreter's initialization phase, some globals are currently
81 initialized dynamically as needed. In the process Unicode objects may
82 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000086
87#ifdef __cplusplus
88extern "C" {
89#endif
90
Victor Stinner8faf8212011-12-08 22:14:11 +010091/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
92#define MAX_UNICODE 0x10ffff
93
/* Type check used by the accessor macros below.  With Py_DEBUG it runs
   _PyUnicode_CheckConsistency() without the content scan (check_content=0);
   otherwise it is a plain PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Field accessors.  The underscore-prefixed forms that consist only of a cast
   and a member access perform no checking; the other forms assert on the
   object first.  `op` is evaluated more than once by the checked forms. */

/* UTF-8 pointer stored in the PyCompactUnicodeObject part (unchecked). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 pointer of a ready string: for a compact ASCII string it is the
   memory right after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* utf8_length field of the PyCompactUnicodeObject part (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the string length for compact ASCII,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked access to the remaining header fields. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Same fields as above, but asserting _PyUnicode_CHECK(op) first. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of the PyUnicodeObject part (unchecked). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? \
      0 : \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200141
/* True if the UTF-8 pointer of `op` is the very same pointer as its
   character data, i.e. the UTF-8 representation is shared with the data
   buffer.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the very same pointer as its
   character data.  Both sides of the comparison use the macro argument `op`;
   the macro no longer refers to an unrelated `unicode` identifier from the
   caller's scope. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
149
/* True if the Unicode object owns a separately allocated UTF-8 memory block:
   it is not a compact ASCII string, its utf8 pointer is set, and that pointer
   is not the character data itself (not shared with other data). */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (!PyUnicode_IS_COMPACT_ASCII(op) \
     && _PyUnicode_UTF8(op) \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a separately allocated wstr memory block:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not the character data itself (not shared with other data). */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (_PyUnicode_WSTR(op) && \
     (!PyUnicode_IS_READY(op) || \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
163
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *"
   (const or not); to is a pointer of type "to_type *" to the buffer that
   receives the converted characters.  Each source character is converted
   with a plain cast to to_type.

   The source pointers are cast to "const from_type *" so that constness is
   not discarded, and every macro-local variable carries a leading underscore
   so that none of them shadows a variable of the calling function.

   The main loop is unrolled four times; the trailing loop copies the
   remaining (fewer than four) characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                            \
        to_type *_to = (to_type *)(to);                             \
        const from_type *_iter = (const from_type *)(begin);        \
        const from_type *_end = (const from_type *)(end);           \
        Py_ssize_t _n = (_end) - (_iter);                           \
        const from_type *_unrolled_end =                            \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);                     \
        while (_iter < (_unrolled_end)) {                           \
            _to[0] = (to_type) _iter[0];                            \
            _to[1] = (to_type) _iter[1];                            \
            _to[2] = (to_type) _iter[2];                            \
            _to[3] = (to_type) _iter[3];                            \
            _iter += 4; _to += 4;                                   \
        }                                                           \
        while (_iter < (_end))                                      \
            *_to++ = (to_type) *_iter++;                            \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200187
/* Over-allocation factor: 4 by default, 2 when building for Windows. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
195
Walter Dörwald16807132007-05-25 13:52:07 +0000196/* This dictionary holds all interned unicode strings. Note that references
197 to strings in this dictionary are *not* counted in the string's ob_refcnt.
198 When the interned string reaches a refcnt of 0 the string deallocation
199 function will delete the reference from this dictionary.
200
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000203*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200204static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000205
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200208
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200210 do { \
211 if (unicode_empty != NULL) \
212 Py_INCREF(unicode_empty); \
213 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200214 unicode_empty = PyUnicode_New(0, 0); \
215 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200216 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200217 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
218 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200219 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200220 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000221
Serhiy Storchaka678db842013-01-26 12:16:36 +0200222#define _Py_RETURN_UNICODE_EMPTY() \
223 do { \
224 _Py_INCREF_UNICODE_EMPTY(); \
225 return unicode_empty; \
226 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000227
Victor Stinner59423e32018-11-26 13:40:01 +0100228static inline void
229unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
230 Py_ssize_t start, Py_ssize_t length)
231{
232 assert(0 <= start);
233 assert(kind != PyUnicode_WCHAR_KIND);
234 switch (kind) {
235 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100236 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100237 Py_UCS1 ch = (unsigned char)value;
238 Py_UCS1 *to = (Py_UCS1 *)data + start;
239 memset(to, ch, length);
240 break;
241 }
242 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100243 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100244 Py_UCS2 ch = (Py_UCS2)value;
245 Py_UCS2 *to = (Py_UCS2 *)data + start;
246 const Py_UCS2 *end = to + length;
247 for (; to < end; ++to) *to = ch;
248 break;
249 }
250 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100251 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100252 Py_UCS4 ch = value;
253 Py_UCS4 * to = (Py_UCS4 *)data + start;
254 const Py_UCS4 *end = to + length;
255 for (; to < end; ++to) *to = ch;
256 break;
257 }
258 default: Py_UNREACHABLE();
259 }
260}
261
262
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200263/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700264static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200265_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
266
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200267/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200268static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270/* Single character Unicode strings in the Latin-1 range are being
271 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200272static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000273
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by a code point in the ASCII range (128 entries): the
   entry is 1 for the characters listed below and 0 for every other one. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
304
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200305/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200306static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200307static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100308static int unicode_modifiable(PyObject *unicode);
309
Victor Stinnerfe226c02011-10-03 03:52:20 +0200310
Alexander Belopolsky40018472011-02-26 01:02:56 +0000311static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100312_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200313static PyObject *
314_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
315static PyObject *
316_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
317
318static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000319unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000320 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100321 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000322 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
323
Alexander Belopolsky40018472011-02-26 01:02:56 +0000324static void
325raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300326 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100327 PyObject *unicode,
328 Py_ssize_t startpos, Py_ssize_t endpos,
329 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000330
/* Same for linebreaks: lookup table indexed by a code point in the ASCII
   range (128 entries); the entry is 1 for the characters listed below and 0
   for every other one. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
358
INADA Naoki3ae20562017-01-16 20:41:20 +0900359static int convert_uc(PyObject *obj, void *addr);
360
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300361#include "clinic/unicodeobject.c.h"
362
Victor Stinner3d4226a2018-08-29 22:21:32 +0200363_Py_error_handler
364_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200365{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200366 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200367 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200368 }
369 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200370 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200371 }
372 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200373 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200374 }
375 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200376 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200377 }
378 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200379 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200382 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200385 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
Victor Stinner50149202015-09-22 00:26:54 +0200387 return _Py_ERROR_OTHER;
388}
389
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300390/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
391 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000392Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000393PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000394{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000395#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000396 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000397#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000398 /* This is actually an illegal character, so it should
399 not be passed to unichr. */
400 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000401#endif
402}
403
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200404int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100405_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200406{
407 PyASCIIObject *ascii;
408 unsigned int kind;
409
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200410 _PyObject_ASSERT(op, PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200411
412 ascii = (PyASCIIObject *)op;
413 kind = ascii->state.kind;
414
Victor Stinnera3b334d2011-10-03 13:53:37 +0200415 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200416 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND);
417 _PyObject_ASSERT(op, ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200418 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200419 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200420 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200421 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200422
Victor Stinnera41463c2011-10-04 01:05:08 +0200423 if (ascii->state.compact == 1) {
424 data = compact + 1;
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200425 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
426 || kind == PyUnicode_2BYTE_KIND
427 || kind == PyUnicode_4BYTE_KIND);
428 _PyObject_ASSERT(op, ascii->state.ascii == 0);
429 _PyObject_ASSERT(op, ascii->state.ready == 1);
430 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100431 }
432 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200433 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
434
435 data = unicode->data.any;
436 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200437 _PyObject_ASSERT(op, ascii->length == 0);
438 _PyObject_ASSERT(op, ascii->hash == -1);
439 _PyObject_ASSERT(op, ascii->state.compact == 0);
440 _PyObject_ASSERT(op, ascii->state.ascii == 0);
441 _PyObject_ASSERT(op, ascii->state.ready == 0);
442 _PyObject_ASSERT(op, ascii->state.interned == SSTATE_NOT_INTERNED);
443 _PyObject_ASSERT(op, ascii->wstr != NULL);
444 _PyObject_ASSERT(op, data == NULL);
445 _PyObject_ASSERT(op, compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200446 }
447 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200448 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
449 || kind == PyUnicode_2BYTE_KIND
450 || kind == PyUnicode_4BYTE_KIND);
451 _PyObject_ASSERT(op, ascii->state.compact == 0);
452 _PyObject_ASSERT(op, ascii->state.ready == 1);
453 _PyObject_ASSERT(op, data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200454 if (ascii->state.ascii) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200455 _PyObject_ASSERT(op, compact->utf8 == data);
456 _PyObject_ASSERT(op, compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200457 }
458 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200459 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200460 }
461 }
462 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200463 if (
464#if SIZEOF_WCHAR_T == 2
465 kind == PyUnicode_2BYTE_KIND
466#else
467 kind == PyUnicode_4BYTE_KIND
468#endif
469 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200470 {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200471 _PyObject_ASSERT(op, ascii->wstr == data);
472 _PyObject_ASSERT(op, compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200473 } else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200474 _PyObject_ASSERT(op, ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200475 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200476
477 if (compact->utf8 == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200478 _PyObject_ASSERT(op, compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200479 if (ascii->wstr == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200480 _PyObject_ASSERT(op, compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200481 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200482
483 /* check that the best kind is used: O(n) operation */
484 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200485 Py_ssize_t i;
486 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200487 void *data;
488 Py_UCS4 ch;
489
490 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200491 for (i=0; i < ascii->length; i++)
492 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200493 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200494 if (ch > maxchar)
495 maxchar = ch;
496 }
497 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100498 if (ascii->state.ascii == 0) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200499 _PyObject_ASSERT(op, maxchar >= 128);
500 _PyObject_ASSERT(op, maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100501 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200502 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200503 _PyObject_ASSERT(op, maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200504 }
Victor Stinner77faf692011-11-20 18:56:05 +0100505 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200506 _PyObject_ASSERT(op, maxchar >= 0x100);
507 _PyObject_ASSERT(op, maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100508 }
509 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200510 _PyObject_ASSERT(op, maxchar >= 0x10000);
511 _PyObject_ASSERT(op, maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100512 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200513 _PyObject_ASSERT(op, PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200514 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400515 return 1;
516}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200517
Victor Stinner910337b2011-10-03 03:20:16 +0200518
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100519static PyObject*
520unicode_result_wchar(PyObject *unicode)
521{
522#ifndef Py_DEBUG
523 Py_ssize_t len;
524
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100525 len = _PyUnicode_WSTR_LENGTH(unicode);
526 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100527 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200528 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100529 }
530
531 if (len == 1) {
532 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100533 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
535 Py_DECREF(unicode);
536 return latin1_char;
537 }
538 }
539
540 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200541 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 return NULL;
543 }
544#else
Victor Stinneraa771272012-10-04 02:32:58 +0200545 assert(Py_REFCNT(unicode) == 1);
546
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100547 /* don't make the result ready in debug mode to ensure that the caller
548 makes the string ready before using it */
549 assert(_PyUnicode_CheckConsistency(unicode, 1));
550#endif
551 return unicode;
552}
553
554static PyObject*
555unicode_result_ready(PyObject *unicode)
556{
557 Py_ssize_t length;
558
559 length = PyUnicode_GET_LENGTH(unicode);
560 if (length == 0) {
561 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100562 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200563 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100564 }
565 return unicode_empty;
566 }
567
568 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200569 void *data = PyUnicode_DATA(unicode);
570 int kind = PyUnicode_KIND(unicode);
571 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100572 if (ch < 256) {
573 PyObject *latin1_char = unicode_latin1[ch];
574 if (latin1_char != NULL) {
575 if (unicode != latin1_char) {
576 Py_INCREF(latin1_char);
577 Py_DECREF(unicode);
578 }
579 return latin1_char;
580 }
581 else {
582 assert(_PyUnicode_CheckConsistency(unicode, 1));
583 Py_INCREF(unicode);
584 unicode_latin1[ch] = unicode;
585 return unicode;
586 }
587 }
588 }
589
590 assert(_PyUnicode_CheckConsistency(unicode, 1));
591 return unicode;
592}
593
594static PyObject*
595unicode_result(PyObject *unicode)
596{
597 assert(_PyUnicode_CHECK(unicode));
598 if (PyUnicode_IS_READY(unicode))
599 return unicode_result_ready(unicode);
600 else
601 return unicode_result_wchar(unicode);
602}
603
Victor Stinnerc4b49542011-12-11 22:44:26 +0100604static PyObject*
605unicode_result_unchanged(PyObject *unicode)
606{
607 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500608 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100609 return NULL;
610 Py_INCREF(unicode);
611 return unicode;
612 }
613 else
614 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100615 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100616}
617
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200618/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
619 ASCII, Latin1, UTF-8, etc. */
620static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200621backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200622 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
623{
Victor Stinnerad771582015-10-09 12:38:53 +0200624 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 Py_UCS4 ch;
626 enum PyUnicode_Kind kind;
627 void *data;
628
629 assert(PyUnicode_IS_READY(unicode));
630 kind = PyUnicode_KIND(unicode);
631 data = PyUnicode_DATA(unicode);
632
633 size = 0;
634 /* determine replacement size */
635 for (i = collstart; i < collend; ++i) {
636 Py_ssize_t incr;
637
638 ch = PyUnicode_READ(kind, data, i);
639 if (ch < 0x100)
640 incr = 2+2;
641 else if (ch < 0x10000)
642 incr = 2+4;
643 else {
644 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200645 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 }
647 if (size > PY_SSIZE_T_MAX - incr) {
648 PyErr_SetString(PyExc_OverflowError,
649 "encoded result is too long for a Python string");
650 return NULL;
651 }
652 size += incr;
653 }
654
Victor Stinnerad771582015-10-09 12:38:53 +0200655 str = _PyBytesWriter_Prepare(writer, str, size);
656 if (str == NULL)
657 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200658
659 /* generate replacement */
660 for (i = collstart; i < collend; ++i) {
661 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200662 *str++ = '\\';
663 if (ch >= 0x00010000) {
664 *str++ = 'U';
665 *str++ = Py_hexdigits[(ch>>28)&0xf];
666 *str++ = Py_hexdigits[(ch>>24)&0xf];
667 *str++ = Py_hexdigits[(ch>>20)&0xf];
668 *str++ = Py_hexdigits[(ch>>16)&0xf];
669 *str++ = Py_hexdigits[(ch>>12)&0xf];
670 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200671 }
Victor Stinner797485e2015-10-09 03:17:30 +0200672 else if (ch >= 0x100) {
673 *str++ = 'u';
674 *str++ = Py_hexdigits[(ch>>12)&0xf];
675 *str++ = Py_hexdigits[(ch>>8)&0xf];
676 }
677 else
678 *str++ = 'x';
679 *str++ = Py_hexdigits[(ch>>4)&0xf];
680 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200681 }
682 return str;
683}
684
685/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
686 ASCII, Latin1, UTF-8, etc. */
687static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200688xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200689 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
690{
Victor Stinnerad771582015-10-09 12:38:53 +0200691 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200692 Py_UCS4 ch;
693 enum PyUnicode_Kind kind;
694 void *data;
695
696 assert(PyUnicode_IS_READY(unicode));
697 kind = PyUnicode_KIND(unicode);
698 data = PyUnicode_DATA(unicode);
699
700 size = 0;
701 /* determine replacement size */
702 for (i = collstart; i < collend; ++i) {
703 Py_ssize_t incr;
704
705 ch = PyUnicode_READ(kind, data, i);
706 if (ch < 10)
707 incr = 2+1+1;
708 else if (ch < 100)
709 incr = 2+2+1;
710 else if (ch < 1000)
711 incr = 2+3+1;
712 else if (ch < 10000)
713 incr = 2+4+1;
714 else if (ch < 100000)
715 incr = 2+5+1;
716 else if (ch < 1000000)
717 incr = 2+6+1;
718 else {
719 assert(ch <= MAX_UNICODE);
720 incr = 2+7+1;
721 }
722 if (size > PY_SSIZE_T_MAX - incr) {
723 PyErr_SetString(PyExc_OverflowError,
724 "encoded result is too long for a Python string");
725 return NULL;
726 }
727 size += incr;
728 }
729
Victor Stinnerad771582015-10-09 12:38:53 +0200730 str = _PyBytesWriter_Prepare(writer, str, size);
731 if (str == NULL)
732 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200733
734 /* generate replacement */
735 for (i = collstart; i < collend; ++i) {
736 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
737 }
738 return str;
739}
740
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741/* --- Bloom Filters ----------------------------------------------------- */
742
743/* stuff to implement simple "bloom filters" for Unicode characters.
744 to keep things simple, we use a single bitmask, using the least 5
745 bits from each unicode characters as the bit index. */
746
747/* the linebreak mask is set up by Unicode_Init below */
748
Antoine Pitrouf068f942010-01-13 14:19:12 +0000749#if LONG_BIT >= 128
750#define BLOOM_WIDTH 128
751#elif LONG_BIT >= 64
752#define BLOOM_WIDTH 64
753#elif LONG_BIT >= 32
754#define BLOOM_WIDTH 32
755#else
756#error "LONG_BIT is smaller than 32"
757#endif
758
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759#define BLOOM_MASK unsigned long
760
Serhiy Storchaka05997252013-01-26 12:14:02 +0200761static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000762
Antoine Pitrouf068f942010-01-13 14:19:12 +0000763#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000764
Benjamin Peterson29060642009-01-31 22:14:21 +0000765#define BLOOM_LINEBREAK(ch) \
766 ((ch) < 128U ? ascii_linebreak[(ch)] : \
767 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000768
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700769static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200770make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000771{
Victor Stinnera85af502013-04-09 21:53:54 +0200772#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
773 do { \
774 TYPE *data = (TYPE *)PTR; \
775 TYPE *end = data + LEN; \
776 Py_UCS4 ch; \
777 for (; data != end; data++) { \
778 ch = *data; \
779 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
780 } \
781 break; \
782 } while (0)
783
Thomas Wouters477c8d52006-05-27 19:21:47 +0000784 /* calculate simple bloom-style bitmask for a given unicode string */
785
Antoine Pitrouf068f942010-01-13 14:19:12 +0000786 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000787
788 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200789 switch (kind) {
790 case PyUnicode_1BYTE_KIND:
791 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
792 break;
793 case PyUnicode_2BYTE_KIND:
794 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
795 break;
796 case PyUnicode_4BYTE_KIND:
797 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
798 break;
799 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700800 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200801 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000802 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200803
804#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000805}
806
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300807static int
808ensure_unicode(PyObject *obj)
809{
810 if (!PyUnicode_Check(obj)) {
811 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200812 "must be str, not %.100s",
813 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300814 return -1;
815 }
816 return PyUnicode_READY(obj);
817}
818
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819/* Compilation of templated routines */
820
821#include "stringlib/asciilib.h"
822#include "stringlib/fastsearch.h"
823#include "stringlib/partition.h"
824#include "stringlib/split.h"
825#include "stringlib/count.h"
826#include "stringlib/find.h"
827#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200828#include "stringlib/undef.h"
829
830#include "stringlib/ucs1lib.h"
831#include "stringlib/fastsearch.h"
832#include "stringlib/partition.h"
833#include "stringlib/split.h"
834#include "stringlib/count.h"
835#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300836#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200837#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200838#include "stringlib/undef.h"
839
840#include "stringlib/ucs2lib.h"
841#include "stringlib/fastsearch.h"
842#include "stringlib/partition.h"
843#include "stringlib/split.h"
844#include "stringlib/count.h"
845#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300846#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200847#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200848#include "stringlib/undef.h"
849
850#include "stringlib/ucs4lib.h"
851#include "stringlib/fastsearch.h"
852#include "stringlib/partition.h"
853#include "stringlib/split.h"
854#include "stringlib/count.h"
855#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300856#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200857#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200858#include "stringlib/undef.h"
859
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200860#include "stringlib/unicodedefs.h"
861#include "stringlib/fastsearch.h"
862#include "stringlib/count.h"
863#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100864#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865
Guido van Rossumd57fd912000-03-10 22:53:23 +0000866/* --- Unicode Object ----------------------------------------------------- */
867
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700868static inline Py_ssize_t
869findchar(const void *s, int kind,
870 Py_ssize_t size, Py_UCS4 ch,
871 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200872{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200873 switch (kind) {
874 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200875 if ((Py_UCS1) ch != ch)
876 return -1;
877 if (direction > 0)
878 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
879 else
880 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200881 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200882 if ((Py_UCS2) ch != ch)
883 return -1;
884 if (direction > 0)
885 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
886 else
887 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200888 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200889 if (direction > 0)
890 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
891 else
892 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200893 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700894 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200896}
897
Victor Stinnerafffce42012-10-03 23:03:17 +0200898#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000899/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200900 earlier.
901
902 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
903 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
904 invalid character in Unicode 6.0. */
905static void
906unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
907{
908 int kind = PyUnicode_KIND(unicode);
909 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
910 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
911 if (length <= old_length)
912 return;
913 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
914}
915#endif
916
/* Resize a compact string to hold length characters by reallocating the
   object itself.

   The caller must pass a string for which unicode_modifiable() is true and
   which is ready and compact (all three are asserted).

   Returns the resized object, which is the pointer returned by
   PyObject_REALLOC() and therefore may differ from the one passed in; the
   caller has to use the returned pointer from then on.  Returns NULL with
   MemoryError set if the new size would overflow or the reallocation fails;
   the original object is then still valid, but a separately allocated UTF-8
   representation may already have been released. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    /* remembered so that only the newly added characters get poisoned */
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* the kind value is used as the size of one character in bytes */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* sampled before the realloc; decides below how wstr is fixed up */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* make sure header + (length + 1) characters fits in Py_ssize_t */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    new_size = (struct_size + (length + 1) * char_size);

    /* drop the separately allocated UTF-8 representation, if any */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_DEL(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* NOTE(review): presumably unregisters the object from the debug
       reference bookkeeping because the realloc may move it; it is
       registered again with _Py_NewReference() on both outcomes below. */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference(unicode);

    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
    if (new_unicode == NULL) {
        /* the old block is untouched: register it again and report */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* wstr aliased the character data: point it at the data of the
           reallocated object */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* a separately allocated wstr is released rather than resized */
        PyObject_DEL(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* write the terminating null character */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
983
/* Resize a non-compact string to length characters without reallocating the
   object itself; only its separately allocated buffers are resized.

   The caller must pass a non-compact string with a reference count of 1
   (both are asserted).

   If the string is ready, its character buffer is reallocated first; unless
   wstr shares that buffer or is NULL, the wstr buffer is reallocated
   afterwards.  If the string is not ready, only the wstr buffer is
   reallocated.

   Returns 0 on success.  Returns -1 with MemoryError set if a size would
   overflow or a reallocation fails. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        /* remembered so that only the newly added characters get poisoned */
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        /* the kind value is used as the size of one character in bytes */
        char_size = PyUnicode_KIND(unicode);
        /* sampled before the realloc; shared pointers are re-pointed at the
           new buffer below */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* make sure (length + 1) characters fits in Py_ssize_t */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* drop a separately allocated UTF-8 representation, if any */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* on failure only the local variable is overwritten; the object
           still holds its original data pointer */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* done unless there is a separate wstr buffer left to resize */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    /* reached for strings that are not ready, and for ready strings that
       own a separate wstr buffer */
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    /* on failure the object still holds its original wstr pointer */
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1062
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063static PyObject*
1064resize_copy(PyObject *unicode, Py_ssize_t length)
1065{
1066 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001067 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001068 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001069
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001070 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001071
1072 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1073 if (copy == NULL)
1074 return NULL;
1075
1076 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001077 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001079 }
1080 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001081 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001082
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001083 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001084 if (w == NULL)
1085 return NULL;
1086 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1087 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001088 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001089 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001090 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001091 }
1092}
1093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001095 Ux0000 terminated; some code (e.g. new_identifier)
1096 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097
1098 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001099 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100
1101*/
1102
Alexander Belopolsky40018472011-02-26 01:02:56 +00001103static PyUnicodeObject *
1104_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001106 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108
Thomas Wouters477c8d52006-05-27 19:21:47 +00001109 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (length == 0 && unicode_empty != NULL) {
1111 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001112 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 }
1114
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001115 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001116 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001117 return (PyUnicodeObject *)PyErr_NoMemory();
1118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 if (length < 0) {
1120 PyErr_SetString(PyExc_SystemError,
1121 "Negative size passed to _PyUnicode_New");
1122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 }
1124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1126 if (unicode == NULL)
1127 return NULL;
1128 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001129
1130 _PyUnicode_WSTR_LENGTH(unicode) = length;
1131 _PyUnicode_HASH(unicode) = -1;
1132 _PyUnicode_STATE(unicode).interned = 0;
1133 _PyUnicode_STATE(unicode).kind = 0;
1134 _PyUnicode_STATE(unicode).compact = 0;
1135 _PyUnicode_STATE(unicode).ready = 0;
1136 _PyUnicode_STATE(unicode).ascii = 0;
1137 _PyUnicode_DATA_ANY(unicode) = NULL;
1138 _PyUnicode_LENGTH(unicode) = 0;
1139 _PyUnicode_UTF8(unicode) = NULL;
1140 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1143 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001144 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001145 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001146 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148
Jeremy Hyltond8082792003-09-16 19:41:39 +00001149 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001150 * the caller fails before initializing str -- unicode_resize()
1151 * reads str[0], and the Keep-Alive optimization can keep memory
1152 * allocated for str alive across a call to unicode_dealloc(unicode).
1153 * We don't want unicode_resize to read uninitialized memory in
1154 * that case.
1155 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156 _PyUnicode_WSTR(unicode)[0] = 0;
1157 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001158
Victor Stinner7931d9a2011-11-04 00:22:48 +01001159 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160 return unicode;
1161}
1162
Victor Stinnerf42dc442011-10-02 23:33:16 +02001163static const char*
1164unicode_kind_name(PyObject *unicode)
1165{
Victor Stinner42dfd712011-10-03 14:41:45 +02001166 /* don't check consistency: unicode_kind_name() is called from
1167 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001168 if (!PyUnicode_IS_COMPACT(unicode))
1169 {
1170 if (!PyUnicode_IS_READY(unicode))
1171 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001172 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001173 {
1174 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 return "legacy ascii";
1177 else
1178 return "legacy latin1";
1179 case PyUnicode_2BYTE_KIND:
1180 return "legacy UCS2";
1181 case PyUnicode_4BYTE_KIND:
1182 return "legacy UCS4";
1183 default:
1184 return "<legacy invalid kind>";
1185 }
1186 }
1187 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001188 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001189 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001190 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001191 return "ascii";
1192 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001193 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001194 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001195 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001196 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001197 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001198 default:
1199 return "<invalid compact kind>";
1200 }
1201}
1202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001205char *_PyUnicode_utf8(void *unicode_raw){
1206 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001207 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208}
1209
Victor Stinnera42de742018-11-22 10:25:22 +01001210void *_PyUnicode_compact_data(void *unicode_raw) {
1211 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 return _PyUnicode_COMPACT_DATA(unicode);
1213}
Victor Stinnera42de742018-11-22 10:25:22 +01001214void *_PyUnicode_data(void *unicode_raw) {
1215 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 printf("obj %p\n", unicode);
1217 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1218 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1219 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1220 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1221 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1222 return PyUnicode_DATA(unicode);
1223}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001224
1225void
1226_PyUnicode_Dump(PyObject *op)
1227{
1228 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001229 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1230 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1231 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001232
Victor Stinnera849a4b2011-10-03 12:12:11 +02001233 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001234 {
1235 if (ascii->state.ascii)
1236 data = (ascii + 1);
1237 else
1238 data = (compact + 1);
1239 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001240 else
1241 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001242 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1243 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001244
Victor Stinnera849a4b2011-10-03 12:12:11 +02001245 if (ascii->wstr == data)
1246 printf("shared ");
1247 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001248
Victor Stinnera3b334d2011-10-03 13:53:37 +02001249 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001250 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001251 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1252 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001253 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1254 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001255 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001256 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001257}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258#endif
1259
1260PyObject *
1261PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1262{
1263 PyObject *obj;
1264 PyCompactUnicodeObject *unicode;
1265 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001266 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001267 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 Py_ssize_t char_size;
1269 Py_ssize_t struct_size;
1270
1271 /* Optimization for empty strings */
1272 if (size == 0 && unicode_empty != NULL) {
1273 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001274 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275 }
1276
Victor Stinner9e9d6892011-10-04 01:02:02 +02001277 is_ascii = 0;
1278 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 struct_size = sizeof(PyCompactUnicodeObject);
1280 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001281 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 char_size = 1;
1283 is_ascii = 1;
1284 struct_size = sizeof(PyASCIIObject);
1285 }
1286 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001287 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001288 char_size = 1;
1289 }
1290 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001291 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001292 char_size = 2;
1293 if (sizeof(wchar_t) == 2)
1294 is_sharing = 1;
1295 }
1296 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001297 if (maxchar > MAX_UNICODE) {
1298 PyErr_SetString(PyExc_SystemError,
1299 "invalid maximum character passed to PyUnicode_New");
1300 return NULL;
1301 }
Victor Stinner8f825062012-04-27 13:55:39 +02001302 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303 char_size = 4;
1304 if (sizeof(wchar_t) == 4)
1305 is_sharing = 1;
1306 }
1307
1308 /* Ensure we won't overflow the size. */
1309 if (size < 0) {
1310 PyErr_SetString(PyExc_SystemError,
1311 "Negative size passed to PyUnicode_New");
1312 return NULL;
1313 }
1314 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1315 return PyErr_NoMemory();
1316
1317 /* Duplicated allocation code from _PyObject_New() instead of a call to
1318 * PyObject_New() so we are able to allocate space for the object and
1319 * it's data buffer.
1320 */
1321 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1322 if (obj == NULL)
1323 return PyErr_NoMemory();
1324 obj = PyObject_INIT(obj, &PyUnicode_Type);
1325 if (obj == NULL)
1326 return NULL;
1327
1328 unicode = (PyCompactUnicodeObject *)obj;
1329 if (is_ascii)
1330 data = ((PyASCIIObject*)obj) + 1;
1331 else
1332 data = unicode + 1;
1333 _PyUnicode_LENGTH(unicode) = size;
1334 _PyUnicode_HASH(unicode) = -1;
1335 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001336 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 _PyUnicode_STATE(unicode).compact = 1;
1338 _PyUnicode_STATE(unicode).ready = 1;
1339 _PyUnicode_STATE(unicode).ascii = is_ascii;
1340 if (is_ascii) {
1341 ((char*)data)[size] = 0;
1342 _PyUnicode_WSTR(unicode) = NULL;
1343 }
Victor Stinner8f825062012-04-27 13:55:39 +02001344 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345 ((char*)data)[size] = 0;
1346 _PyUnicode_WSTR(unicode) = NULL;
1347 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001349 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 else {
1352 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001353 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001354 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001356 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 ((Py_UCS4*)data)[size] = 0;
1358 if (is_sharing) {
1359 _PyUnicode_WSTR_LENGTH(unicode) = size;
1360 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1361 }
1362 else {
1363 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1364 _PyUnicode_WSTR(unicode) = NULL;
1365 }
1366 }
Victor Stinner8f825062012-04-27 13:55:39 +02001367#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001368 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001369#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001370 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 return obj;
1372}
1373
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t buffer [begin, end) into the UCS4 buffer of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into a single code point; every other unit (lone surrogates
   included) is copied through unchanged.  The other width conversions are
   implemented as macros for efficiency.

   `unicode` must already be a 4-byte-kind string.  The function assumes that
   its buffer can hold one more code point than it produces, so that the
   caller can store a terminating null character; the terminator is not
   written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        /* There must still be room for at least one more code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!(Py_UNICODE_IS_HIGH_SURROGATE(src[0])
              && (src + 1) < end
              && Py_UNICODE_IS_LOW_SURROGATE(src[1])))
        {
            /* Ordinary unit or lone surrogate: copy as-is. */
            *dst++ = *src;
            src++;
            continue;
        }
        /* Valid surrogate pair: two input units become one code point. */
        *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        dst++;
        src += 2;
    }
    /* The output must have been filled exactly up to the string length. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1413
Victor Stinnercd9950f2011-10-02 00:34:53 +02001414static int
Victor Stinner488fa492011-12-12 00:01:39 +01001415unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001416{
Victor Stinner488fa492011-12-12 00:01:39 +01001417 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001418 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001419 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001420 return -1;
1421 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001422 return 0;
1423}
1424
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001425static int
1426_copy_characters(PyObject *to, Py_ssize_t to_start,
1427 PyObject *from, Py_ssize_t from_start,
1428 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001430 unsigned int from_kind, to_kind;
1431 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001432
Victor Stinneree4544c2012-05-09 22:24:08 +02001433 assert(0 <= how_many);
1434 assert(0 <= from_start);
1435 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001436 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001437 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001438 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439
Victor Stinnerd3f08822012-05-29 12:57:52 +02001440 assert(PyUnicode_Check(to));
1441 assert(PyUnicode_IS_READY(to));
1442 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1443
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001444 if (how_many == 0)
1445 return 0;
1446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001448 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001450 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451
Victor Stinnerf1852262012-06-16 16:38:26 +02001452#ifdef Py_DEBUG
1453 if (!check_maxchar
1454 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1455 {
1456 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1457 Py_UCS4 ch;
1458 Py_ssize_t i;
1459 for (i=0; i < how_many; i++) {
1460 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1461 assert(ch <= to_maxchar);
1462 }
1463 }
1464#endif
1465
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001466 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001467 if (check_maxchar
1468 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1469 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001470 /* Writing Latin-1 characters into an ASCII string requires to
1471 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001472 Py_UCS4 max_char;
1473 max_char = ucs1lib_find_max_char(from_data,
1474 (Py_UCS1*)from_data + how_many);
1475 if (max_char >= 128)
1476 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001477 }
Christian Heimesf051e432016-09-13 20:22:02 +02001478 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001479 (char*)from_data + from_kind * from_start,
1480 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001482 else if (from_kind == PyUnicode_1BYTE_KIND
1483 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001484 {
1485 _PyUnicode_CONVERT_BYTES(
1486 Py_UCS1, Py_UCS2,
1487 PyUnicode_1BYTE_DATA(from) + from_start,
1488 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1489 PyUnicode_2BYTE_DATA(to) + to_start
1490 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001491 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001492 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001493 && to_kind == PyUnicode_4BYTE_KIND)
1494 {
1495 _PyUnicode_CONVERT_BYTES(
1496 Py_UCS1, Py_UCS4,
1497 PyUnicode_1BYTE_DATA(from) + from_start,
1498 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1499 PyUnicode_4BYTE_DATA(to) + to_start
1500 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001501 }
1502 else if (from_kind == PyUnicode_2BYTE_KIND
1503 && to_kind == PyUnicode_4BYTE_KIND)
1504 {
1505 _PyUnicode_CONVERT_BYTES(
1506 Py_UCS2, Py_UCS4,
1507 PyUnicode_2BYTE_DATA(from) + from_start,
1508 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1509 PyUnicode_4BYTE_DATA(to) + to_start
1510 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001511 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001513 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1514
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001515 if (!check_maxchar) {
1516 if (from_kind == PyUnicode_2BYTE_KIND
1517 && to_kind == PyUnicode_1BYTE_KIND)
1518 {
1519 _PyUnicode_CONVERT_BYTES(
1520 Py_UCS2, Py_UCS1,
1521 PyUnicode_2BYTE_DATA(from) + from_start,
1522 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1523 PyUnicode_1BYTE_DATA(to) + to_start
1524 );
1525 }
1526 else if (from_kind == PyUnicode_4BYTE_KIND
1527 && to_kind == PyUnicode_1BYTE_KIND)
1528 {
1529 _PyUnicode_CONVERT_BYTES(
1530 Py_UCS4, Py_UCS1,
1531 PyUnicode_4BYTE_DATA(from) + from_start,
1532 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1533 PyUnicode_1BYTE_DATA(to) + to_start
1534 );
1535 }
1536 else if (from_kind == PyUnicode_4BYTE_KIND
1537 && to_kind == PyUnicode_2BYTE_KIND)
1538 {
1539 _PyUnicode_CONVERT_BYTES(
1540 Py_UCS4, Py_UCS2,
1541 PyUnicode_4BYTE_DATA(from) + from_start,
1542 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1543 PyUnicode_2BYTE_DATA(to) + to_start
1544 );
1545 }
1546 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001547 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001548 }
1549 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001550 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001551 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001552 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001553 Py_ssize_t i;
1554
Victor Stinnera0702ab2011-09-29 14:14:38 +02001555 for (i=0; i < how_many; i++) {
1556 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001557 if (ch > to_maxchar)
1558 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001559 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1560 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001561 }
1562 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return 0;
1564}
1565
Victor Stinnerd3f08822012-05-29 12:57:52 +02001566void
1567_PyUnicode_FastCopyCharacters(
1568 PyObject *to, Py_ssize_t to_start,
1569 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570{
1571 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1572}
1573
1574Py_ssize_t
1575PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1576 PyObject *from, Py_ssize_t from_start,
1577 Py_ssize_t how_many)
1578{
1579 int err;
1580
1581 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1582 PyErr_BadInternalCall();
1583 return -1;
1584 }
1585
Benjamin Petersonbac79492012-01-14 13:34:47 -05001586 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001587 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001588 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001589 return -1;
1590
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001591 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001592 PyErr_SetString(PyExc_IndexError, "string index out of range");
1593 return -1;
1594 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001595 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001596 PyErr_SetString(PyExc_IndexError, "string index out of range");
1597 return -1;
1598 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001599 if (how_many < 0) {
1600 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1601 return -1;
1602 }
1603 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001604 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1605 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001606 "Cannot write %zi characters at %zi "
1607 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001608 how_many, to_start, PyUnicode_GET_LENGTH(to));
1609 return -1;
1610 }
1611
1612 if (how_many == 0)
1613 return 0;
1614
Victor Stinner488fa492011-12-12 00:01:39 +01001615 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001616 return -1;
1617
1618 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1619 if (err) {
1620 PyErr_Format(PyExc_SystemError,
1621 "Cannot copy %s characters "
1622 "into a string of %s characters",
1623 unicode_kind_name(from),
1624 unicode_kind_name(to));
1625 return -1;
1626 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001627 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628}
1629
Victor Stinner17222162011-09-28 22:15:37 +02001630/* Find the maximum code point and count the number of surrogate pairs so a
1631 correct string length can be computed before converting a string to UCS4.
1632 This function counts single surrogates as a character and not as a pair.
1633
1634 Return 0 on success, or -1 on error. */
1635static int
1636find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1637 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638{
1639 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001640 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641
Victor Stinnerc53be962011-10-02 21:33:54 +02001642 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001643 *num_surrogates = 0;
1644 *maxchar = 0;
1645
1646 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001648 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1649 && (iter+1) < end
1650 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1651 {
1652 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1653 ++(*num_surrogates);
1654 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 }
1656 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001658 {
1659 ch = *iter;
1660 iter++;
1661 }
1662 if (ch > *maxchar) {
1663 *maxchar = ch;
1664 if (*maxchar > MAX_UNICODE) {
1665 PyErr_Format(PyExc_ValueError,
1666 "character U+%x is not in range [U+0000; U+10ffff]",
1667 ch);
1668 return -1;
1669 }
1670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 }
1672 return 0;
1673}
1674
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001675int
1676_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677{
1678 wchar_t *end;
1679 Py_UCS4 maxchar = 0;
1680 Py_ssize_t num_surrogates;
1681#if SIZEOF_WCHAR_T == 2
1682 Py_ssize_t length_wo_surrogates;
1683#endif
1684
Georg Brandl7597add2011-10-05 16:36:47 +02001685 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001686 strings were created using _PyObject_New() and where no canonical
1687 representation (the str field) has been set yet aka strings
1688 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001689 assert(_PyUnicode_CHECK(unicode));
1690 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001694 /* Actually, it should neither be interned nor be anything else: */
1695 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001698 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001699 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701
1702 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001703 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1704 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 PyErr_NoMemory();
1706 return -1;
1707 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001708 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 _PyUnicode_WSTR(unicode), end,
1710 PyUnicode_1BYTE_DATA(unicode));
1711 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1712 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1713 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1714 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001715 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001716 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 }
1719 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001720 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001721 _PyUnicode_UTF8(unicode) = NULL;
1722 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 }
1724 PyObject_FREE(_PyUnicode_WSTR(unicode));
1725 _PyUnicode_WSTR(unicode) = NULL;
1726 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1727 }
1728 /* In this case we might have to convert down from 4-byte native
1729 wchar_t to 2-byte unicode. */
1730 else if (maxchar < 65536) {
1731 assert(num_surrogates == 0 &&
1732 "FindMaxCharAndNumSurrogatePairs() messed up");
1733
Victor Stinner506f5922011-09-28 22:34:18 +02001734#if SIZEOF_WCHAR_T == 2
1735 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001736 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001737 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1738 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1739 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001740 _PyUnicode_UTF8(unicode) = NULL;
1741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001742#else
1743 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001744 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001745 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001746 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001747 PyErr_NoMemory();
1748 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 }
Victor Stinner506f5922011-09-28 22:34:18 +02001750 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1751 _PyUnicode_WSTR(unicode), end,
1752 PyUnicode_2BYTE_DATA(unicode));
1753 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1754 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001758 PyObject_FREE(_PyUnicode_WSTR(unicode));
1759 _PyUnicode_WSTR(unicode) = NULL;
1760 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1761#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 }
1763 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1764 else {
1765#if SIZEOF_WCHAR_T == 2
1766 /* in case the native representation is 2-bytes, we need to allocate a
1767 new normalized 4-byte version. */
1768 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001769 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1770 PyErr_NoMemory();
1771 return -1;
1772 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001773 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1774 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 PyErr_NoMemory();
1776 return -1;
1777 }
1778 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1779 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001780 _PyUnicode_UTF8(unicode) = NULL;
1781 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001782 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1783 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001784 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 PyObject_FREE(_PyUnicode_WSTR(unicode));
1786 _PyUnicode_WSTR(unicode) = NULL;
1787 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1788#else
1789 assert(num_surrogates == 0);
1790
Victor Stinnerc3c74152011-10-02 20:39:55 +02001791 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001793 _PyUnicode_UTF8(unicode) = NULL;
1794 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1796#endif
1797 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1798 }
1799 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001800 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 return 0;
1802}
1803
Alexander Belopolsky40018472011-02-26 01:02:56 +00001804static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001805unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806{
Walter Dörwald16807132007-05-25 13:52:07 +00001807 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 case SSTATE_NOT_INTERNED:
1809 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001810
Benjamin Peterson29060642009-01-31 22:14:21 +00001811 case SSTATE_INTERNED_MORTAL:
1812 /* revive dead object temporarily for DelItem */
1813 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001814 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001815 Py_FatalError(
1816 "deletion of interned string failed");
1817 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001818
Benjamin Peterson29060642009-01-31 22:14:21 +00001819 case SSTATE_INTERNED_IMMORTAL:
1820 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001821 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001822
Benjamin Peterson29060642009-01-31 22:14:21 +00001823 default:
1824 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001825 }
1826
Victor Stinner03490912011-10-03 23:45:12 +02001827 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001829 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001830 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001831 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1832 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001834 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835}
1836
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons (the
   empty string or a cached one-character Latin-1 string), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of length one can be a cached Latin-1 char. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1853
Alexander Belopolsky40018472011-02-26 01:02:56 +00001854static int
Victor Stinner488fa492011-12-12 00:01:39 +01001855unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001856{
Victor Stinner488fa492011-12-12 00:01:39 +01001857 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001858 if (Py_REFCNT(unicode) != 1)
1859 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001860 if (_PyUnicode_HASH(unicode) != -1)
1861 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 if (PyUnicode_CHECK_INTERNED(unicode))
1863 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001864 if (!PyUnicode_CheckExact(unicode))
1865 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001866#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001867 /* singleton refcount is greater than 1 */
1868 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001869#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001870 return 1;
1871}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001872
Victor Stinnerfe226c02011-10-03 03:52:20 +02001873static int
1874unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1875{
1876 PyObject *unicode;
1877 Py_ssize_t old_length;
1878
1879 assert(p_unicode != NULL);
1880 unicode = *p_unicode;
1881
1882 assert(unicode != NULL);
1883 assert(PyUnicode_Check(unicode));
1884 assert(0 <= length);
1885
Victor Stinner910337b2011-10-03 03:20:16 +02001886 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 old_length = PyUnicode_WSTR_LENGTH(unicode);
1888 else
1889 old_length = PyUnicode_GET_LENGTH(unicode);
1890 if (old_length == length)
1891 return 0;
1892
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001893 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001894 _Py_INCREF_UNICODE_EMPTY();
1895 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001896 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001897 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001898 return 0;
1899 }
1900
Victor Stinner488fa492011-12-12 00:01:39 +01001901 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001902 PyObject *copy = resize_copy(unicode, length);
1903 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001905 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001906 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001907 }
1908
Victor Stinnerfe226c02011-10-03 03:52:20 +02001909 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001910 PyObject *new_unicode = resize_compact(unicode, length);
1911 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001912 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001913 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001914 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001915 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001916 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001917}
1918
Alexander Belopolsky40018472011-02-26 01:02:56 +00001919int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001920PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001921{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001922 PyObject *unicode;
1923 if (p_unicode == NULL) {
1924 PyErr_BadInternalCall();
1925 return -1;
1926 }
1927 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001928 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001929 {
1930 PyErr_BadInternalCall();
1931 return -1;
1932 }
1933 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001934}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001935
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001936/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001937
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001938 WARNING: The function doesn't copy the terminating null character and
1939 doesn't check the maximum character (may write a latin1 character in an
1940 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001941static void
1942unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1943 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944{
1945 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1946 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001947 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948
1949 switch (kind) {
1950 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001951 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001952#ifdef Py_DEBUG
1953 if (PyUnicode_IS_ASCII(unicode)) {
1954 Py_UCS4 maxchar = ucs1lib_find_max_char(
1955 (const Py_UCS1*)str,
1956 (const Py_UCS1*)str + len);
1957 assert(maxchar < 128);
1958 }
1959#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001960 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001961 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001962 }
1963 case PyUnicode_2BYTE_KIND: {
1964 Py_UCS2 *start = (Py_UCS2 *)data + index;
1965 Py_UCS2 *ucs2 = start;
1966 assert(index <= PyUnicode_GET_LENGTH(unicode));
1967
Victor Stinner184252a2012-06-16 02:57:41 +02001968 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001969 *ucs2 = (Py_UCS2)*str;
1970
1971 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001972 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001973 }
1974 default: {
1975 Py_UCS4 *start = (Py_UCS4 *)data + index;
1976 Py_UCS4 *ucs4 = start;
1977 assert(kind == PyUnicode_4BYTE_KIND);
1978 assert(index <= PyUnicode_GET_LENGTH(unicode));
1979
Victor Stinner184252a2012-06-16 02:57:41 +02001980 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001981 *ucs4 = (Py_UCS4)*str;
1982
1983 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001984 }
1985 }
1986}
1987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988static PyObject*
1989get_latin1_char(unsigned char ch)
1990{
Victor Stinnera464fc12011-10-02 20:39:30 +02001991 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001993 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 if (!unicode)
1995 return NULL;
1996 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001997 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 unicode_latin1[ch] = unicode;
1999 }
2000 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002001 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002}
2003
Victor Stinner985a82a2014-01-03 12:53:47 +01002004static PyObject*
2005unicode_char(Py_UCS4 ch)
2006{
2007 PyObject *unicode;
2008
2009 assert(ch <= MAX_UNICODE);
2010
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002011 if (ch < 256)
2012 return get_latin1_char(ch);
2013
Victor Stinner985a82a2014-01-03 12:53:47 +01002014 unicode = PyUnicode_New(1, ch);
2015 if (unicode == NULL)
2016 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002017
2018 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2019 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002020 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002021 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002022 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2023 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2024 }
2025 assert(_PyUnicode_CheckConsistency(unicode, 1));
2026 return unicode;
2027}
2028
Alexander Belopolsky40018472011-02-26 01:02:56 +00002029PyObject *
2030PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002032 if (u == NULL)
2033 return (PyObject*)_PyUnicode_New(size);
2034
2035 if (size < 0) {
2036 PyErr_BadInternalCall();
2037 return NULL;
2038 }
2039
2040 return PyUnicode_FromWideChar(u, size);
2041}
2042
2043PyObject *
2044PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2045{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002046 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 Py_UCS4 maxchar = 0;
2048 Py_ssize_t num_surrogates;
2049
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002050 if (u == NULL && size != 0) {
2051 PyErr_BadInternalCall();
2052 return NULL;
2053 }
2054
2055 if (size == -1) {
2056 size = wcslen(u);
2057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002059 /* If the Unicode data is known at construction time, we can apply
2060 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002063 if (size == 0)
2064 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 /* Single character Unicode objects in the Latin-1 range are
2067 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002068 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002069 return get_latin1_char((unsigned char)*u);
2070
2071 /* If not empty and not single character, copy the Unicode data
2072 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002073 if (find_maxchar_surrogates(u, u + size,
2074 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002075 return NULL;
2076
Victor Stinner8faf8212011-12-08 22:14:11 +01002077 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 if (!unicode)
2079 return NULL;
2080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 switch (PyUnicode_KIND(unicode)) {
2082 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002083 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2085 break;
2086 case PyUnicode_2BYTE_KIND:
2087#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002088 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002090 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2092#endif
2093 break;
2094 case PyUnicode_4BYTE_KIND:
2095#if SIZEOF_WCHAR_T == 2
2096 /* This is the only case which has to process surrogates, thus
2097 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002098 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099#else
2100 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002101 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102#endif
2103 break;
2104 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002105 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002108 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109}
2110
Alexander Belopolsky40018472011-02-26 01:02:56 +00002111PyObject *
2112PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002113{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002114 if (size < 0) {
2115 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002116 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002117 return NULL;
2118 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002119 if (u != NULL)
2120 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2121 else
2122 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002123}
2124
Alexander Belopolsky40018472011-02-26 01:02:56 +00002125PyObject *
2126PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002127{
2128 size_t size = strlen(u);
2129 if (size > PY_SSIZE_T_MAX) {
2130 PyErr_SetString(PyExc_OverflowError, "input too long");
2131 return NULL;
2132 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002133 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002134}
2135
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002136PyObject *
2137_PyUnicode_FromId(_Py_Identifier *id)
2138{
2139 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002140 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2141 strlen(id->string),
2142 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002143 if (!id->object)
2144 return NULL;
2145 PyUnicode_InternInPlace(&id->object);
2146 assert(!id->next);
2147 id->next = static_strings;
2148 static_strings = id;
2149 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002150 return id->object;
2151}
2152
2153void
2154_PyUnicode_ClearStaticStrings()
2155{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002156 _Py_Identifier *tmp, *s = static_strings;
2157 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002158 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002159 tmp = s->next;
2160 s->next = NULL;
2161 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002162 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002163 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002164}
2165
Benjamin Peterson0df54292012-03-26 14:50:32 -04002166/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002167
Victor Stinnerd3f08822012-05-29 12:57:52 +02002168PyObject*
2169_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002170{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002171 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002172 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002173 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002174#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002175 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002176#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002177 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002178 }
Victor Stinner785938e2011-12-11 20:09:03 +01002179 unicode = PyUnicode_New(size, 127);
2180 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002181 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002182 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2183 assert(_PyUnicode_CheckConsistency(unicode, 1));
2184 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002185}
2186
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002187static Py_UCS4
2188kind_maxchar_limit(unsigned int kind)
2189{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002190 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002191 case PyUnicode_1BYTE_KIND:
2192 return 0x80;
2193 case PyUnicode_2BYTE_KIND:
2194 return 0x100;
2195 case PyUnicode_4BYTE_KIND:
2196 return 0x10000;
2197 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002198 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002199 }
2200}
2201
Victor Stinner702c7342011-10-05 13:50:52 +02002202static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002203_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002206 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002207
Serhiy Storchaka678db842013-01-26 12:16:36 +02002208 if (size == 0)
2209 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002211 if (size == 1)
2212 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002213
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002214 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002215 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 if (!res)
2217 return NULL;
2218 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002219 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002221}
2222
Victor Stinnere57b1c02011-09-28 22:20:48 +02002223static PyObject*
2224_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225{
2226 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002227 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228
Serhiy Storchaka678db842013-01-26 12:16:36 +02002229 if (size == 0)
2230 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002231 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002232 if (size == 1)
2233 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002234
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002235 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002236 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 if (!res)
2238 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002239 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002241 else {
2242 _PyUnicode_CONVERT_BYTES(
2243 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2244 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002245 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 return res;
2247}
2248
Victor Stinnere57b1c02011-09-28 22:20:48 +02002249static PyObject*
2250_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251{
2252 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002253 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002254
Serhiy Storchaka678db842013-01-26 12:16:36 +02002255 if (size == 0)
2256 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002257 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002258 if (size == 1)
2259 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002260
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002261 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002262 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 if (!res)
2264 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002265 if (max_char < 256)
2266 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2267 PyUnicode_1BYTE_DATA(res));
2268 else if (max_char < 0x10000)
2269 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2270 PyUnicode_2BYTE_DATA(res));
2271 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002273 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 return res;
2275}
2276
2277PyObject*
2278PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2279{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002280 if (size < 0) {
2281 PyErr_SetString(PyExc_ValueError, "size must be positive");
2282 return NULL;
2283 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002284 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002286 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002287 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002288 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002290 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002291 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292 PyErr_SetString(PyExc_SystemError, "invalid kind");
2293 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295}
2296
Victor Stinnerece58de2012-04-23 23:36:38 +02002297Py_UCS4
2298_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2299{
2300 enum PyUnicode_Kind kind;
2301 void *startptr, *endptr;
2302
2303 assert(PyUnicode_IS_READY(unicode));
2304 assert(0 <= start);
2305 assert(end <= PyUnicode_GET_LENGTH(unicode));
2306 assert(start <= end);
2307
2308 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2309 return PyUnicode_MAX_CHAR_VALUE(unicode);
2310
2311 if (start == end)
2312 return 127;
2313
Victor Stinner94d558b2012-04-27 22:26:58 +02002314 if (PyUnicode_IS_ASCII(unicode))
2315 return 127;
2316
Victor Stinnerece58de2012-04-23 23:36:38 +02002317 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002318 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002319 endptr = (char *)startptr + end * kind;
2320 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002321 switch(kind) {
2322 case PyUnicode_1BYTE_KIND:
2323 return ucs1lib_find_max_char(startptr, endptr);
2324 case PyUnicode_2BYTE_KIND:
2325 return ucs2lib_find_max_char(startptr, endptr);
2326 case PyUnicode_4BYTE_KIND:
2327 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002328 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002329 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002330 }
2331}
2332
Victor Stinner25a4b292011-10-06 12:31:55 +02002333/* Ensure that a string uses the most efficient storage, if it is not the
2334 case: create a new string with of the right kind. Write NULL into *p_unicode
2335 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002336static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002337unicode_adjust_maxchar(PyObject **p_unicode)
2338{
2339 PyObject *unicode, *copy;
2340 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002341 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002342 unsigned int kind;
2343
2344 assert(p_unicode != NULL);
2345 unicode = *p_unicode;
2346 assert(PyUnicode_IS_READY(unicode));
2347 if (PyUnicode_IS_ASCII(unicode))
2348 return;
2349
2350 len = PyUnicode_GET_LENGTH(unicode);
2351 kind = PyUnicode_KIND(unicode);
2352 if (kind == PyUnicode_1BYTE_KIND) {
2353 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002354 max_char = ucs1lib_find_max_char(u, u + len);
2355 if (max_char >= 128)
2356 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002357 }
2358 else if (kind == PyUnicode_2BYTE_KIND) {
2359 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002360 max_char = ucs2lib_find_max_char(u, u + len);
2361 if (max_char >= 256)
2362 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002363 }
2364 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002365 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002366 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002367 max_char = ucs4lib_find_max_char(u, u + len);
2368 if (max_char >= 0x10000)
2369 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002370 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002371 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002372 if (copy != NULL)
2373 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002374 Py_DECREF(unicode);
2375 *p_unicode = copy;
2376}
2377
Victor Stinner034f6cf2011-09-30 02:26:44 +02002378PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002379_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002380{
Victor Stinner87af4f22011-11-21 23:03:47 +01002381 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002382 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383
Victor Stinner034f6cf2011-09-30 02:26:44 +02002384 if (!PyUnicode_Check(unicode)) {
2385 PyErr_BadInternalCall();
2386 return NULL;
2387 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002388 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002389 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002390
Victor Stinner87af4f22011-11-21 23:03:47 +01002391 length = PyUnicode_GET_LENGTH(unicode);
2392 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002393 if (!copy)
2394 return NULL;
2395 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2396
Christian Heimesf051e432016-09-13 20:22:02 +02002397 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002398 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002399 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002400 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002401}
2402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403
Victor Stinnerbc603d12011-10-02 01:00:40 +02002404/* Widen Unicode objects to larger buffers. Don't write terminating null
2405 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406
2407void*
2408_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2409{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002410 Py_ssize_t len;
2411 void *result;
2412 unsigned int skind;
2413
Benjamin Petersonbac79492012-01-14 13:34:47 -05002414 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002415 return NULL;
2416
2417 len = PyUnicode_GET_LENGTH(s);
2418 skind = PyUnicode_KIND(s);
2419 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002420 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 return NULL;
2422 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002423 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002424 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002425 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002426 if (!result)
2427 return PyErr_NoMemory();
2428 assert(skind == PyUnicode_1BYTE_KIND);
2429 _PyUnicode_CONVERT_BYTES(
2430 Py_UCS1, Py_UCS2,
2431 PyUnicode_1BYTE_DATA(s),
2432 PyUnicode_1BYTE_DATA(s) + len,
2433 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002435 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002436 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002437 if (!result)
2438 return PyErr_NoMemory();
2439 if (skind == PyUnicode_2BYTE_KIND) {
2440 _PyUnicode_CONVERT_BYTES(
2441 Py_UCS2, Py_UCS4,
2442 PyUnicode_2BYTE_DATA(s),
2443 PyUnicode_2BYTE_DATA(s) + len,
2444 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002446 else {
2447 assert(skind == PyUnicode_1BYTE_KIND);
2448 _PyUnicode_CONVERT_BYTES(
2449 Py_UCS1, Py_UCS4,
2450 PyUnicode_1BYTE_DATA(s),
2451 PyUnicode_1BYTE_DATA(s) + len,
2452 result);
2453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002455 default:
2456 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 }
Victor Stinner01698042011-10-04 00:04:26 +02002458 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return NULL;
2460}
2461
2462static Py_UCS4*
2463as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2464 int copy_null)
2465{
2466 int kind;
2467 void *data;
2468 Py_ssize_t len, targetlen;
2469 if (PyUnicode_READY(string) == -1)
2470 return NULL;
2471 kind = PyUnicode_KIND(string);
2472 data = PyUnicode_DATA(string);
2473 len = PyUnicode_GET_LENGTH(string);
2474 targetlen = len;
2475 if (copy_null)
2476 targetlen++;
2477 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002478 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 if (!target) {
2480 PyErr_NoMemory();
2481 return NULL;
2482 }
2483 }
2484 else {
2485 if (targetsize < targetlen) {
2486 PyErr_Format(PyExc_SystemError,
2487 "string is longer than the buffer");
2488 if (copy_null && 0 < targetsize)
2489 target[0] = 0;
2490 return NULL;
2491 }
2492 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002493 if (kind == PyUnicode_1BYTE_KIND) {
2494 Py_UCS1 *start = (Py_UCS1 *) data;
2495 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002497 else if (kind == PyUnicode_2BYTE_KIND) {
2498 Py_UCS2 *start = (Py_UCS2 *) data;
2499 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2500 }
2501 else {
2502 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002503 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 if (copy_null)
2506 target[len] = 0;
2507 return target;
2508}
2509
2510Py_UCS4*
2511PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2512 int copy_null)
2513{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002514 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 PyErr_BadInternalCall();
2516 return NULL;
2517 }
2518 return as_ucs4(string, target, targetsize, copy_null);
2519}
2520
2521Py_UCS4*
2522PyUnicode_AsUCS4Copy(PyObject *string)
2523{
2524 return as_ucs4(string, NULL, 0, 1);
2525}
2526
/* Maximum number of characters required for output of %lld or %p.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256), and
   (x - 1) / 22 + 1 is the integer form of ceil(x / 22) for positive x,
   which is where the constant 2 (ceiling + sign) comes from. */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG) * 53 - 1) / 22 + 2)
Victor Stinner96865452011-03-01 23:44:09 +00002531
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002532static int
2533unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2534 Py_ssize_t width, Py_ssize_t precision)
2535{
2536 Py_ssize_t length, fill, arglen;
2537 Py_UCS4 maxchar;
2538
2539 if (PyUnicode_READY(str) == -1)
2540 return -1;
2541
2542 length = PyUnicode_GET_LENGTH(str);
2543 if ((precision == -1 || precision >= length)
2544 && width <= length)
2545 return _PyUnicodeWriter_WriteStr(writer, str);
2546
2547 if (precision != -1)
2548 length = Py_MIN(precision, length);
2549
2550 arglen = Py_MAX(length, width);
2551 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2552 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2553 else
2554 maxchar = writer->maxchar;
2555
2556 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2557 return -1;
2558
2559 if (width > length) {
2560 fill = width - length;
2561 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2562 return -1;
2563 writer->pos += fill;
2564 }
2565
2566 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2567 str, 0, length);
2568 writer->pos += length;
2569 return 0;
2570}
2571
2572static int
Victor Stinner998b8062018-09-12 00:23:25 +02002573unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 Py_ssize_t width, Py_ssize_t precision)
2575{
2576 /* UTF-8 */
2577 Py_ssize_t length;
2578 PyObject *unicode;
2579 int res;
2580
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002581 if (precision == -1) {
2582 length = strlen(str);
2583 }
2584 else {
2585 length = 0;
2586 while (length < precision && str[length]) {
2587 length++;
2588 }
2589 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2591 if (unicode == NULL)
2592 return -1;
2593
2594 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2595 Py_DECREF(unicode);
2596 return res;
2597}
2598
Victor Stinner96865452011-03-01 23:44:09 +00002599static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002600unicode_fromformat_arg(_PyUnicodeWriter *writer,
2601 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002602{
Victor Stinnere215d962012-10-06 23:03:36 +02002603 const char *p;
2604 Py_ssize_t len;
2605 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 Py_ssize_t width;
2607 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002608 int longflag;
2609 int longlongflag;
2610 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002612
2613 p = f;
2614 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002615 zeropad = 0;
2616 if (*f == '0') {
2617 zeropad = 1;
2618 f++;
2619 }
Victor Stinner96865452011-03-01 23:44:09 +00002620
2621 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002622 width = -1;
2623 if (Py_ISDIGIT((unsigned)*f)) {
2624 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002625 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002626 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002627 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002628 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002630 return NULL;
2631 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002632 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002633 f++;
2634 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002635 }
2636 precision = -1;
2637 if (*f == '.') {
2638 f++;
2639 if (Py_ISDIGIT((unsigned)*f)) {
2640 precision = (*f - '0');
2641 f++;
2642 while (Py_ISDIGIT((unsigned)*f)) {
2643 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2644 PyErr_SetString(PyExc_ValueError,
2645 "precision too big");
2646 return NULL;
2647 }
2648 precision = (precision * 10) + (*f - '0');
2649 f++;
2650 }
2651 }
Victor Stinner96865452011-03-01 23:44:09 +00002652 if (*f == '%') {
2653 /* "%.3%s" => f points to "3" */
2654 f--;
2655 }
2656 }
2657 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002658 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002659 f--;
2660 }
Victor Stinner96865452011-03-01 23:44:09 +00002661
2662 /* Handle %ld, %lu, %lld and %llu. */
2663 longflag = 0;
2664 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002665 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002666 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002667 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002668 longflag = 1;
2669 ++f;
2670 }
Victor Stinner96865452011-03-01 23:44:09 +00002671 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002672 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002673 longlongflag = 1;
2674 f += 2;
2675 }
Victor Stinner96865452011-03-01 23:44:09 +00002676 }
2677 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002678 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002679 size_tflag = 1;
2680 ++f;
2681 }
Victor Stinnere215d962012-10-06 23:03:36 +02002682
2683 if (f[1] == '\0')
2684 writer->overallocate = 0;
2685
2686 switch (*f) {
2687 case 'c':
2688 {
2689 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002690 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002691 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002692 "character argument not in range(0x110000)");
2693 return NULL;
2694 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002695 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002696 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002697 break;
2698 }
2699
2700 case 'i':
2701 case 'd':
2702 case 'u':
2703 case 'x':
2704 {
2705 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002706 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002707 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002708
2709 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002710 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002711 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002712 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002713 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002714 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002715 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002716 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002717 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002718 va_arg(*vargs, size_t));
2719 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002720 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002721 va_arg(*vargs, unsigned int));
2722 }
2723 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002724 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002725 }
2726 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002727 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002728 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002729 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002730 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002731 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002732 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002733 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002734 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002735 va_arg(*vargs, Py_ssize_t));
2736 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002737 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002738 va_arg(*vargs, int));
2739 }
2740 assert(len >= 0);
2741
Victor Stinnere215d962012-10-06 23:03:36 +02002742 if (precision < len)
2743 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002744
2745 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002746 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2747 return NULL;
2748
Victor Stinnere215d962012-10-06 23:03:36 +02002749 if (width > precision) {
2750 Py_UCS4 fillchar;
2751 fill = width - precision;
2752 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002753 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2754 return NULL;
2755 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002756 }
Victor Stinner15a11362012-10-06 23:48:20 +02002757 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002758 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002759 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2760 return NULL;
2761 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002762 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002763
Victor Stinner4a587072013-11-19 12:54:53 +01002764 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2765 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002766 break;
2767 }
2768
2769 case 'p':
2770 {
2771 char number[MAX_LONG_LONG_CHARS];
2772
2773 len = sprintf(number, "%p", va_arg(*vargs, void*));
2774 assert(len >= 0);
2775
2776 /* %p is ill-defined: ensure leading 0x. */
2777 if (number[1] == 'X')
2778 number[1] = 'x';
2779 else if (number[1] != 'x') {
2780 memmove(number + 2, number,
2781 strlen(number) + 1);
2782 number[0] = '0';
2783 number[1] = 'x';
2784 len += 2;
2785 }
2786
Victor Stinner4a587072013-11-19 12:54:53 +01002787 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002788 return NULL;
2789 break;
2790 }
2791
2792 case 's':
2793 {
2794 /* UTF-8 */
2795 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002796 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002797 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002798 break;
2799 }
2800
2801 case 'U':
2802 {
2803 PyObject *obj = va_arg(*vargs, PyObject *);
2804 assert(obj && _PyUnicode_CHECK(obj));
2805
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002806 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002807 return NULL;
2808 break;
2809 }
2810
2811 case 'V':
2812 {
2813 PyObject *obj = va_arg(*vargs, PyObject *);
2814 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002815 if (obj) {
2816 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002817 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002818 return NULL;
2819 }
2820 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002821 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002822 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002823 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002824 }
2825 break;
2826 }
2827
2828 case 'S':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *str;
2832 assert(obj);
2833 str = PyObject_Str(obj);
2834 if (!str)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(str);
2838 return NULL;
2839 }
2840 Py_DECREF(str);
2841 break;
2842 }
2843
2844 case 'R':
2845 {
2846 PyObject *obj = va_arg(*vargs, PyObject *);
2847 PyObject *repr;
2848 assert(obj);
2849 repr = PyObject_Repr(obj);
2850 if (!repr)
2851 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002852 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002853 Py_DECREF(repr);
2854 return NULL;
2855 }
2856 Py_DECREF(repr);
2857 break;
2858 }
2859
2860 case 'A':
2861 {
2862 PyObject *obj = va_arg(*vargs, PyObject *);
2863 PyObject *ascii;
2864 assert(obj);
2865 ascii = PyObject_ASCII(obj);
2866 if (!ascii)
2867 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002868 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002869 Py_DECREF(ascii);
2870 return NULL;
2871 }
2872 Py_DECREF(ascii);
2873 break;
2874 }
2875
2876 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002877 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002878 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002879 break;
2880
2881 default:
2882 /* if we stumble upon an unknown formatting code, copy the rest
2883 of the format string to the output string. (we cannot just
2884 skip the code, since there's no way to know what's in the
2885 argument list) */
2886 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002887 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002888 return NULL;
2889 f = p+len;
2890 return f;
2891 }
2892
2893 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002894 return f;
2895}
2896
Walter Dörwaldd2034312007-05-18 16:29:38 +00002897PyObject *
2898PyUnicode_FromFormatV(const char *format, va_list vargs)
2899{
Victor Stinnere215d962012-10-06 23:03:36 +02002900 va_list vargs2;
2901 const char *f;
2902 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002903
Victor Stinner8f674cc2013-04-17 23:02:17 +02002904 _PyUnicodeWriter_Init(&writer);
2905 writer.min_length = strlen(format) + 100;
2906 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002907
Benjamin Peterson0c212142016-09-20 20:39:33 -07002908 // Copy varags to be able to pass a reference to a subfunction.
2909 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002910
2911 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002912 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002913 f = unicode_fromformat_arg(&writer, f, &vargs2);
2914 if (f == NULL)
2915 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002917 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002918 const char *p;
2919 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002920
Victor Stinnere215d962012-10-06 23:03:36 +02002921 p = f;
2922 do
2923 {
2924 if ((unsigned char)*p > 127) {
2925 PyErr_Format(PyExc_ValueError,
2926 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2927 "string, got a non-ASCII byte: 0x%02x",
2928 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002929 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002930 }
2931 p++;
2932 }
2933 while (*p != '\0' && *p != '%');
2934 len = p - f;
2935
2936 if (*p == '\0')
2937 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002938
2939 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002940 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002941
2942 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002943 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002944 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002945 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002946 return _PyUnicodeWriter_Finish(&writer);
2947
2948 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002949 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002950 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002951 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002952}
2953
Walter Dörwaldd2034312007-05-18 16:29:38 +00002954PyObject *
2955PyUnicode_FromFormat(const char *format, ...)
2956{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002957 PyObject* ret;
2958 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002959
2960#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002961 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002962#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002963 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002964#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002965 ret = PyUnicode_FromFormatV(format, vargs);
2966 va_end(vargs);
2967 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002968}
2969
Serhiy Storchakac46db922018-10-23 22:58:24 +03002970static Py_ssize_t
2971unicode_get_widechar_size(PyObject *unicode)
2972{
2973 Py_ssize_t res;
2974
2975 assert(unicode != NULL);
2976 assert(_PyUnicode_CHECK(unicode));
2977
2978 if (_PyUnicode_WSTR(unicode) != NULL) {
2979 return PyUnicode_WSTR_LENGTH(unicode);
2980 }
2981 assert(PyUnicode_IS_READY(unicode));
2982
2983 res = _PyUnicode_LENGTH(unicode);
2984#if SIZEOF_WCHAR_T == 2
2985 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2986 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2987 const Py_UCS4 *end = s + res;
2988 for (; s < end; ++s) {
2989 if (*s > 0xFFFF) {
2990 ++res;
2991 }
2992 }
2993 }
2994#endif
2995 return res;
2996}
2997
2998static void
2999unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3000{
3001 const wchar_t *wstr;
3002
3003 assert(unicode != NULL);
3004 assert(_PyUnicode_CHECK(unicode));
3005
3006 wstr = _PyUnicode_WSTR(unicode);
3007 if (wstr != NULL) {
3008 memcpy(w, wstr, size * sizeof(wchar_t));
3009 return;
3010 }
3011 assert(PyUnicode_IS_READY(unicode));
3012
3013 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3014 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3015 for (; size--; ++s, ++w) {
3016 *w = *s;
3017 }
3018 }
3019 else {
3020#if SIZEOF_WCHAR_T == 4
3021 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3022 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3023 for (; size--; ++s, ++w) {
3024 *w = *s;
3025 }
3026#else
3027 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3028 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3029 for (; size--; ++s, ++w) {
3030 Py_UCS4 ch = *s;
3031 if (ch > 0xFFFF) {
3032 assert(ch <= MAX_UNICODE);
3033 /* encode surrogate pair in this case */
3034 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3035 if (!size--)
3036 break;
3037 *w = Py_UNICODE_LOW_SURROGATE(ch);
3038 }
3039 else {
3040 *w = ch;
3041 }
3042 }
3043#endif
3044 }
3045}
3046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003047#ifdef HAVE_WCHAR_H
3048
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003049/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003050
Victor Stinnerd88d9832011-09-06 02:00:05 +02003051 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003052 character) required to convert the unicode object. Ignore size argument.
3053
Victor Stinnerd88d9832011-09-06 02:00:05 +02003054 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003055 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003056 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003057Py_ssize_t
3058PyUnicode_AsWideChar(PyObject *unicode,
3059 wchar_t *w,
3060 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003061{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003062 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003063
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003064 if (unicode == NULL) {
3065 PyErr_BadInternalCall();
3066 return -1;
3067 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003068 if (!PyUnicode_Check(unicode)) {
3069 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003070 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003071 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003072
3073 res = unicode_get_widechar_size(unicode);
3074 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003075 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003076 }
3077
3078 if (size > res) {
3079 size = res + 1;
3080 }
3081 else {
3082 res = size;
3083 }
3084 unicode_copy_as_widechar(unicode, w, size);
3085 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003086}
3087
Victor Stinner137c34c2010-09-29 10:25:54 +00003088wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003089PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003090 Py_ssize_t *size)
3091{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003092 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003093 Py_ssize_t buflen;
3094
3095 if (unicode == NULL) {
3096 PyErr_BadInternalCall();
3097 return NULL;
3098 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003099 if (!PyUnicode_Check(unicode)) {
3100 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003101 return NULL;
3102 }
3103
Serhiy Storchakac46db922018-10-23 22:58:24 +03003104 buflen = unicode_get_widechar_size(unicode);
3105 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003106 if (buffer == NULL) {
3107 PyErr_NoMemory();
3108 return NULL;
3109 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003110 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3111 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003112 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003113 }
3114 else if (wcslen(buffer) != (size_t)buflen) {
3115 PyMem_FREE(buffer);
3116 PyErr_SetString(PyExc_ValueError,
3117 "embedded null character");
3118 return NULL;
3119 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003120 return buffer;
3121}
3122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003123#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124
Alexander Belopolsky40018472011-02-26 01:02:56 +00003125PyObject *
3126PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003127{
Victor Stinner8faf8212011-12-08 22:14:11 +01003128 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003129 PyErr_SetString(PyExc_ValueError,
3130 "chr() arg not in range(0x110000)");
3131 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003132 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003133
Victor Stinner985a82a2014-01-03 12:53:47 +01003134 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003135}
3136
Alexander Belopolsky40018472011-02-26 01:02:56 +00003137PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003138PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003140 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003141 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003142 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003143 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003144 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 Py_INCREF(obj);
3146 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003147 }
3148 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003149 /* For a Unicode subtype that's not a Unicode object,
3150 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003151 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003152 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003153 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003154 "Can't convert '%.100s' object to str implicitly",
3155 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003156 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003157}
3158
Alexander Belopolsky40018472011-02-26 01:02:56 +00003159PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003160PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003161 const char *encoding,
3162 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003163{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003164 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003165 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003166
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 PyErr_BadInternalCall();
3169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003171
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003172 /* Decoding bytes objects is the most common case and should be fast */
3173 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003174 if (PyBytes_GET_SIZE(obj) == 0)
3175 _Py_RETURN_UNICODE_EMPTY();
3176 v = PyUnicode_Decode(
3177 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3178 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003179 return v;
3180 }
3181
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003182 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003183 PyErr_SetString(PyExc_TypeError,
3184 "decoding str is not supported");
3185 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003186 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003187
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003188 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3189 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3190 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003191 "decoding to str: need a bytes-like object, %.80s found",
3192 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003193 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003194 }
Tim Petersced69f82003-09-16 20:30:58 +00003195
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003196 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003197 PyBuffer_Release(&buffer);
3198 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003200
Serhiy Storchaka05997252013-01-26 12:14:02 +02003201 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003202 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003203 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204}
3205
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Every
   run of other characters that sits between two copied characters becomes a
   single '_'; such runs at the start or at the end of the name are dropped.
   The result is null-terminated.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    char *const out_end = &lower[lower_len - 1];   /* slot kept for the NUL */
    int pending_separator = 0;
    const char *in;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the punctuation; it is only emitted if another
               regular character follows. */
            pending_separator = 1;
            continue;
        }

        if (pending_separator && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_separator = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3254
Alexander Belopolsky40018472011-02-26 01:02:56 +00003255PyObject *
3256PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003257 Py_ssize_t size,
3258 const char *encoding,
3259 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003260{
3261 PyObject *buffer = NULL, *unicode;
3262 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003263 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3264
3265 if (encoding == NULL) {
3266 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3267 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003268
Fred Drakee4315f52000-05-09 19:53:39 +00003269 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003270 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3271 char *lower = buflower;
3272
3273 /* Fast paths */
3274 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3275 lower += 3;
3276 if (*lower == '_') {
3277 /* Match "utf8" and "utf_8" */
3278 lower++;
3279 }
3280
3281 if (lower[0] == '8' && lower[1] == 0) {
3282 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3283 }
3284 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3285 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3286 }
3287 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3288 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3289 }
3290 }
3291 else {
3292 if (strcmp(lower, "ascii") == 0
3293 || strcmp(lower, "us_ascii") == 0) {
3294 return PyUnicode_DecodeASCII(s, size, errors);
3295 }
Steve Dowercc16be82016-09-08 10:35:16 -07003296 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003297 else if (strcmp(lower, "mbcs") == 0) {
3298 return PyUnicode_DecodeMBCS(s, size, errors);
3299 }
3300 #endif
3301 else if (strcmp(lower, "latin1") == 0
3302 || strcmp(lower, "latin_1") == 0
3303 || strcmp(lower, "iso_8859_1") == 0
3304 || strcmp(lower, "iso8859_1") == 0) {
3305 return PyUnicode_DecodeLatin1(s, size, errors);
3306 }
3307 }
Victor Stinner37296e82010-06-10 13:36:23 +00003308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309
3310 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003311 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003312 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003313 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003314 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 if (buffer == NULL)
3316 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003317 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 if (unicode == NULL)
3319 goto onError;
3320 if (!PyUnicode_Check(unicode)) {
3321 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003322 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003323 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003324 encoding,
3325 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 Py_DECREF(unicode);
3327 goto onError;
3328 }
3329 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003330 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003331
Benjamin Peterson29060642009-01-31 22:14:21 +00003332 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333 Py_XDECREF(buffer);
3334 return NULL;
3335}
3336
Alexander Belopolsky40018472011-02-26 01:02:56 +00003337PyObject *
3338PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003339 const char *encoding,
3340 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003341{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003342 if (!PyUnicode_Check(unicode)) {
3343 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003344 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003345 }
3346
Serhiy Storchaka00939072016-10-27 21:05:49 +03003347 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3348 "PyUnicode_AsDecodedObject() is deprecated; "
3349 "use PyCodec_Decode() to decode from str", 1) < 0)
3350 return NULL;
3351
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003352 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003353 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003354
3355 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003356 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003357}
3358
Alexander Belopolsky40018472011-02-26 01:02:56 +00003359PyObject *
3360PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003361 const char *encoding,
3362 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003363{
3364 PyObject *v;
3365
3366 if (!PyUnicode_Check(unicode)) {
3367 PyErr_BadArgument();
3368 goto onError;
3369 }
3370
Serhiy Storchaka00939072016-10-27 21:05:49 +03003371 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3372 "PyUnicode_AsDecodedUnicode() is deprecated; "
3373 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3374 return NULL;
3375
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003376 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003377 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003378
3379 /* Decode via the codec registry */
3380 v = PyCodec_Decode(unicode, encoding, errors);
3381 if (v == NULL)
3382 goto onError;
3383 if (!PyUnicode_Check(v)) {
3384 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003385 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003386 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003387 encoding,
3388 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003389 Py_DECREF(v);
3390 goto onError;
3391 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003392 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003393
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003395 return NULL;
3396}
3397
Alexander Belopolsky40018472011-02-26 01:02:56 +00003398PyObject *
3399PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003400 Py_ssize_t size,
3401 const char *encoding,
3402 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403{
3404 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003405
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003406 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003408 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3410 Py_DECREF(unicode);
3411 return v;
3412}
3413
Alexander Belopolsky40018472011-02-26 01:02:56 +00003414PyObject *
3415PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003416 const char *encoding,
3417 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003418{
3419 PyObject *v;
3420
3421 if (!PyUnicode_Check(unicode)) {
3422 PyErr_BadArgument();
3423 goto onError;
3424 }
3425
Serhiy Storchaka00939072016-10-27 21:05:49 +03003426 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3427 "PyUnicode_AsEncodedObject() is deprecated; "
3428 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3429 "or PyCodec_Encode() for generic encoding", 1) < 0)
3430 return NULL;
3431
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003432 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003433 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003434
3435 /* Encode via the codec registry */
3436 v = PyCodec_Encode(unicode, encoding, errors);
3437 if (v == NULL)
3438 goto onError;
3439 return v;
3440
Benjamin Peterson29060642009-01-31 22:14:21 +00003441 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003442 return NULL;
3443}
3444
Victor Stinner1b579672011-12-17 05:47:23 +01003445
Victor Stinner2cba6b82018-01-10 22:46:15 +01003446static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003447unicode_encode_locale(PyObject *unicode, const char *errors,
3448 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003449{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003450 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003451
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003452 Py_ssize_t wlen;
3453 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3454 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003455 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003456 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003457
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003458 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003459 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003460 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003461 return NULL;
3462 }
3463
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003464 char *str;
3465 size_t error_pos;
3466 const char *reason;
3467 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003468 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003469 PyMem_Free(wstr);
3470
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003471 if (res != 0) {
3472 if (res == -2) {
3473 PyObject *exc;
3474 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3475 "locale", unicode,
3476 (Py_ssize_t)error_pos,
3477 (Py_ssize_t)(error_pos+1),
3478 reason);
3479 if (exc != NULL) {
3480 PyCodec_StrictErrors(exc);
3481 Py_DECREF(exc);
3482 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003483 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003484 else if (res == -3) {
3485 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3486 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003487 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003488 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003489 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003490 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003491 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003492
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003493 PyObject *bytes = PyBytes_FromString(str);
3494 PyMem_RawFree(str);
3495 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003496}
3497
Victor Stinnerad158722010-10-27 00:25:46 +00003498PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003499PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3500{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003501 return unicode_encode_locale(unicode, errors, 1);
3502}
3503
3504PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003505PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003506{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003507 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003508 const _PyCoreConfig *config = &interp->core_config;
Victor Stinnere2510952019-05-02 11:28:57 -04003509#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003510 return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
3511#else
Victor Stinner793b5312011-04-27 00:24:21 +02003512 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3513 cannot use it to encode and decode filenames before it is loaded. Load
3514 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003515 implementation of the locale codec until the codec registry is
3516 initialized and the Python codec is loaded. See initfsencoding(). */
3517 if (interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003518 return PyUnicode_AsEncodedString(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003519 config->filesystem_encoding,
3520 config->filesystem_errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003521 }
3522 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003523 return unicode_encode_locale(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003524 config->filesystem_errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003525 }
Victor Stinnerad158722010-10-27 00:25:46 +00003526#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003527}
3528
Alexander Belopolsky40018472011-02-26 01:02:56 +00003529PyObject *
3530PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003531 const char *encoding,
3532 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533{
3534 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003535 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003536
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 if (!PyUnicode_Check(unicode)) {
3538 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 }
Fred Drakee4315f52000-05-09 19:53:39 +00003541
Victor Stinner942889a2016-09-05 15:40:10 -07003542 if (encoding == NULL) {
3543 return _PyUnicode_AsUTF8String(unicode, errors);
3544 }
3545
Fred Drakee4315f52000-05-09 19:53:39 +00003546 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003547 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3548 char *lower = buflower;
3549
3550 /* Fast paths */
3551 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3552 lower += 3;
3553 if (*lower == '_') {
3554 /* Match "utf8" and "utf_8" */
3555 lower++;
3556 }
3557
3558 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003559 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003560 }
3561 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3562 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3563 }
3564 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3565 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3566 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003567 }
Victor Stinner942889a2016-09-05 15:40:10 -07003568 else {
3569 if (strcmp(lower, "ascii") == 0
3570 || strcmp(lower, "us_ascii") == 0) {
3571 return _PyUnicode_AsASCIIString(unicode, errors);
3572 }
Steve Dowercc16be82016-09-08 10:35:16 -07003573#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003574 else if (strcmp(lower, "mbcs") == 0) {
3575 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3576 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003577#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003578 else if (strcmp(lower, "latin1") == 0 ||
3579 strcmp(lower, "latin_1") == 0 ||
3580 strcmp(lower, "iso_8859_1") == 0 ||
3581 strcmp(lower, "iso8859_1") == 0) {
3582 return _PyUnicode_AsLatin1String(unicode, errors);
3583 }
3584 }
Victor Stinner37296e82010-06-10 13:36:23 +00003585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586
3587 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003588 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003590 return NULL;
3591
3592 /* The normal path */
3593 if (PyBytes_Check(v))
3594 return v;
3595
3596 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003597 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003598 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003599 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003600
3601 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003602 "encoder %s returned bytearray instead of bytes; "
3603 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003604 encoding);
3605 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003606 Py_DECREF(v);
3607 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003608 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003609
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003610 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3611 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003612 Py_DECREF(v);
3613 return b;
3614 }
3615
3616 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003617 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003618 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003619 encoding,
3620 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003621 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003622 return NULL;
3623}
3624
Alexander Belopolsky40018472011-02-26 01:02:56 +00003625PyObject *
3626PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003627 const char *encoding,
3628 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003629{
3630 PyObject *v;
3631
3632 if (!PyUnicode_Check(unicode)) {
3633 PyErr_BadArgument();
3634 goto onError;
3635 }
3636
Serhiy Storchaka00939072016-10-27 21:05:49 +03003637 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3638 "PyUnicode_AsEncodedUnicode() is deprecated; "
3639 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3640 return NULL;
3641
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003642 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003644
3645 /* Encode via the codec registry */
3646 v = PyCodec_Encode(unicode, encoding, errors);
3647 if (v == NULL)
3648 goto onError;
3649 if (!PyUnicode_Check(v)) {
3650 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003651 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003652 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003653 encoding,
3654 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003655 Py_DECREF(v);
3656 goto onError;
3657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003659
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 return NULL;
3662}
3663
Victor Stinner2cba6b82018-01-10 22:46:15 +01003664static PyObject*
3665unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3666 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003667{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003668 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003669
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003670 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3671 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003672 return NULL;
3673 }
3674
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003675 wchar_t *wstr;
3676 size_t wlen;
3677 const char *reason;
3678 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003679 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003680 if (res != 0) {
3681 if (res == -2) {
3682 PyObject *exc;
3683 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3684 "locale", str, len,
3685 (Py_ssize_t)wlen,
3686 (Py_ssize_t)(wlen + 1),
3687 reason);
3688 if (exc != NULL) {
3689 PyCodec_StrictErrors(exc);
3690 Py_DECREF(exc);
3691 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003692 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003693 else if (res == -3) {
3694 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3695 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003696 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003697 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003698 }
Victor Stinner2f197072011-12-17 07:08:30 +01003699 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003700 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003701
3702 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3703 PyMem_RawFree(wstr);
3704 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003705}
3706
3707PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003708PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3709 const char *errors)
3710{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003711 return unicode_decode_locale(str, len, errors, 1);
3712}
3713
3714PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003715PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003716{
3717 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003718 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003719}
3720
3721
3722PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003723PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003724 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003725 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3726}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003727
Christian Heimes5894ba72007-11-04 11:43:14 +00003728PyObject*
3729PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3730{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003731 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003732 const _PyCoreConfig *config = &interp->core_config;
Victor Stinnere2510952019-05-02 11:28:57 -04003733#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003734 return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
3735#else
Victor Stinner793b5312011-04-27 00:24:21 +02003736 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3737 cannot use it to encode and decode filenames before it is loaded. Load
3738 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003739 implementation of the locale codec until the codec registry is
3740 initialized and the Python codec is loaded. See initfsencoding(). */
3741 if (interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003742 return PyUnicode_Decode(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003743 config->filesystem_encoding,
3744 config->filesystem_errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003745 }
3746 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003747 return unicode_decode_locale(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003748 config->filesystem_errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003749 }
Victor Stinnerad158722010-10-27 00:25:46 +00003750#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003751}
3752
Martin v. Löwis011e8422009-05-05 04:43:17 +00003753
3754int
3755PyUnicode_FSConverter(PyObject* arg, void* addr)
3756{
Brett Cannonec6ce872016-09-06 15:50:29 -07003757 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003758 PyObject *output = NULL;
3759 Py_ssize_t size;
3760 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003761 if (arg == NULL) {
3762 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003763 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003764 return 1;
3765 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003766 path = PyOS_FSPath(arg);
3767 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003768 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003769 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003770 if (PyBytes_Check(path)) {
3771 output = path;
3772 }
3773 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3774 output = PyUnicode_EncodeFSDefault(path);
3775 Py_DECREF(path);
3776 if (!output) {
3777 return 0;
3778 }
3779 assert(PyBytes_Check(output));
3780 }
3781
Victor Stinner0ea2a462010-04-30 00:22:08 +00003782 size = PyBytes_GET_SIZE(output);
3783 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003784 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003785 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003786 Py_DECREF(output);
3787 return 0;
3788 }
3789 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003790 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003791}
3792
3793
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003794int
3795PyUnicode_FSDecoder(PyObject* arg, void* addr)
3796{
Brett Cannona5711202016-09-06 19:36:01 -07003797 int is_buffer = 0;
3798 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003799 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003800 if (arg == NULL) {
3801 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003802 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003803 return 1;
3804 }
Brett Cannona5711202016-09-06 19:36:01 -07003805
3806 is_buffer = PyObject_CheckBuffer(arg);
3807 if (!is_buffer) {
3808 path = PyOS_FSPath(arg);
3809 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003810 return 0;
3811 }
Brett Cannona5711202016-09-06 19:36:01 -07003812 }
3813 else {
3814 path = arg;
3815 Py_INCREF(arg);
3816 }
3817
3818 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003819 output = path;
3820 }
3821 else if (PyBytes_Check(path) || is_buffer) {
3822 PyObject *path_bytes = NULL;
3823
3824 if (!PyBytes_Check(path) &&
3825 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003826 "path should be string, bytes, or os.PathLike, not %.200s",
3827 Py_TYPE(arg)->tp_name)) {
3828 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003829 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003830 }
3831 path_bytes = PyBytes_FromObject(path);
3832 Py_DECREF(path);
3833 if (!path_bytes) {
3834 return 0;
3835 }
3836 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3837 PyBytes_GET_SIZE(path_bytes));
3838 Py_DECREF(path_bytes);
3839 if (!output) {
3840 return 0;
3841 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003842 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003843 else {
3844 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003845 "path should be string, bytes, or os.PathLike, not %.200s",
3846 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003847 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003848 return 0;
3849 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003850 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003851 Py_DECREF(output);
3852 return 0;
3853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003855 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003856 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003857 Py_DECREF(output);
3858 return 0;
3859 }
3860 *(PyObject**)addr = output;
3861 return Py_CLEANUP_SUPPORTED;
3862}
3863
3864
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003865const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003867{
Christian Heimesf3863112007-11-22 07:46:41 +00003868 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003870 if (!PyUnicode_Check(unicode)) {
3871 PyErr_BadArgument();
3872 return NULL;
3873 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003874 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003875 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003877 if (PyUnicode_UTF8(unicode) == NULL) {
3878 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003879 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880 if (bytes == NULL)
3881 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3883 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003884 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885 Py_DECREF(bytes);
3886 return NULL;
3887 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003888 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003889 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003890 PyBytes_AS_STRING(bytes),
3891 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003892 Py_DECREF(bytes);
3893 }
3894
3895 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003896 *psize = PyUnicode_UTF8_LENGTH(unicode);
3897 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003898}
3899
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003900const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3904}
3905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906Py_UNICODE *
3907PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3908{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003909 if (!PyUnicode_Check(unicode)) {
3910 PyErr_BadArgument();
3911 return NULL;
3912 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003913 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3914 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003916 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003917 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918
Serhiy Storchakac46db922018-10-23 22:58:24 +03003919 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3920 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3921 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003924 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3925 if (w == NULL) {
3926 PyErr_NoMemory();
3927 return NULL;
3928 }
3929 unicode_copy_as_widechar(unicode, w, wlen + 1);
3930 _PyUnicode_WSTR(unicode) = w;
3931 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3932 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 }
3934 }
3935 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003936 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003937 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00003938}
3939
Alexander Belopolsky40018472011-02-26 01:02:56 +00003940Py_UNICODE *
3941PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944}
3945
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003946const Py_UNICODE *
3947_PyUnicode_AsUnicode(PyObject *unicode)
3948{
3949 Py_ssize_t size;
3950 const Py_UNICODE *wstr;
3951
3952 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3953 if (wstr && wcslen(wstr) != (size_t)size) {
3954 PyErr_SetString(PyExc_ValueError, "embedded null character");
3955 return NULL;
3956 }
3957 return wstr;
3958}
3959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960
Alexander Belopolsky40018472011-02-26 01:02:56 +00003961Py_ssize_t
3962PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963{
3964 if (!PyUnicode_Check(unicode)) {
3965 PyErr_BadArgument();
3966 goto onError;
3967 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003968 if (_PyUnicode_WSTR(unicode) == NULL) {
3969 if (PyUnicode_AsUnicode(unicode) == NULL)
3970 goto onError;
3971 }
3972 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973
Benjamin Peterson29060642009-01-31 22:14:21 +00003974 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 return -1;
3976}
3977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978Py_ssize_t
3979PyUnicode_GetLength(PyObject *unicode)
3980{
Victor Stinner07621332012-06-16 04:53:46 +02003981 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982 PyErr_BadArgument();
3983 return -1;
3984 }
Victor Stinner07621332012-06-16 04:53:46 +02003985 if (PyUnicode_READY(unicode) == -1)
3986 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 return PyUnicode_GET_LENGTH(unicode);
3988}
3989
3990Py_UCS4
3991PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3992{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003993 void *data;
3994 int kind;
3995
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003996 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003997 PyErr_BadArgument();
3998 return (Py_UCS4)-1;
3999 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004000 if (PyUnicode_READY(unicode) == -1) {
4001 return (Py_UCS4)-1;
4002 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004003 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004004 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005 return (Py_UCS4)-1;
4006 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004007 data = PyUnicode_DATA(unicode);
4008 kind = PyUnicode_KIND(unicode);
4009 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010}
4011
4012int
4013PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4014{
4015 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004016 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017 return -1;
4018 }
Victor Stinner488fa492011-12-12 00:01:39 +01004019 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004020 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004021 PyErr_SetString(PyExc_IndexError, "string index out of range");
4022 return -1;
4023 }
Victor Stinner488fa492011-12-12 00:01:39 +01004024 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004025 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004026 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4027 PyErr_SetString(PyExc_ValueError, "character out of range");
4028 return -1;
4029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4031 index, ch);
4032 return 0;
4033}
4034
/* Return the name of the default encoding: always the constant string
   "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4040
Victor Stinner554f3f02010-06-16 23:33:54 +00004041/* create or adjust a UnicodeDecodeError */
4042static void
4043make_decode_exception(PyObject **exceptionObject,
4044 const char *encoding,
4045 const char *input, Py_ssize_t length,
4046 Py_ssize_t startpos, Py_ssize_t endpos,
4047 const char *reason)
4048{
4049 if (*exceptionObject == NULL) {
4050 *exceptionObject = PyUnicodeDecodeError_Create(
4051 encoding, input, length, startpos, endpos, reason);
4052 }
4053 else {
4054 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4055 goto onError;
4056 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4057 goto onError;
4058 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4059 goto onError;
4060 }
4061 return;
4062
4063onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004064 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004065}
4066
Steve Dowercc16be82016-09-08 10:35:16 -07004067#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004068static int
4069widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4070{
4071 if (newsize > *size) {
4072 wchar_t *newbuf = *buf;
4073 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4074 PyErr_NoMemory();
4075 return -1;
4076 }
4077 *buf = newbuf;
4078 }
4079 *size = newsize;
4080 return 0;
4081}
4082
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083/* error handling callback helper:
4084 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004085 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 and adjust various state variables.
4087 return 0 on success, -1 on error
4088*/
4089
Alexander Belopolsky40018472011-02-26 01:02:56 +00004090static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004091unicode_decode_call_errorhandler_wchar(
4092 const char *errors, PyObject **errorHandler,
4093 const char *encoding, const char *reason,
4094 const char **input, const char **inend, Py_ssize_t *startinpos,
4095 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004096 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004098 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004099
4100 PyObject *restuple = NULL;
4101 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004102 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004103 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004104 Py_ssize_t requiredsize;
4105 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004106 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004107 wchar_t *repwstr;
4108 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109
4110 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 *errorHandler = PyCodec_LookupError(errors);
4112 if (*errorHandler == NULL)
4113 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 }
4115
Victor Stinner554f3f02010-06-16 23:33:54 +00004116 make_decode_exception(exceptionObject,
4117 encoding,
4118 *input, *inend - *input,
4119 *startinpos, *endinpos,
4120 reason);
4121 if (*exceptionObject == NULL)
4122 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004124 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004128 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004131 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004133
4134 /* Copy back the bytes variables, which might have been modified by the
4135 callback */
4136 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4137 if (!inputobj)
4138 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004139 *input = PyBytes_AS_STRING(inputobj);
4140 insize = PyBytes_GET_SIZE(inputobj);
4141 *inend = *input + insize;
4142 /* we can DECREF safely, as the exception has another reference,
4143 so the object won't go away. */
4144 Py_DECREF(inputobj);
4145
4146 if (newpos<0)
4147 newpos = insize+newpos;
4148 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004149 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004150 goto onError;
4151 }
4152
4153 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4154 if (repwstr == NULL)
4155 goto onError;
4156 /* need more space? (at least enough for what we
4157 have+the replacement+the rest of the string (starting
4158 at the new input position), so we won't have to check space
4159 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004160 requiredsize = *outpos;
4161 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4162 goto overflow;
4163 requiredsize += repwlen;
4164 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4165 goto overflow;
4166 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004167 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004168 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004169 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004170 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004171 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004172 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004173 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004174 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004175 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004176 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004177 *endinpos = newpos;
4178 *inptr = *input + newpos;
4179
4180 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004181 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004182 return 0;
4183
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004184 overflow:
4185 PyErr_SetString(PyExc_OverflowError,
4186 "decoded result is too long for a Python string");
4187
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004188 onError:
4189 Py_XDECREF(restuple);
4190 return -1;
4191}
Steve Dowercc16be82016-09-08 10:35:16 -07004192#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004193
4194static int
4195unicode_decode_call_errorhandler_writer(
4196 const char *errors, PyObject **errorHandler,
4197 const char *encoding, const char *reason,
4198 const char **input, const char **inend, Py_ssize_t *startinpos,
4199 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4200 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4201{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004202 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004203
4204 PyObject *restuple = NULL;
4205 PyObject *repunicode = NULL;
4206 Py_ssize_t insize;
4207 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004208 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004209 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004210 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004211 int need_to_grow = 0;
4212 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004213
4214 if (*errorHandler == NULL) {
4215 *errorHandler = PyCodec_LookupError(errors);
4216 if (*errorHandler == NULL)
4217 goto onError;
4218 }
4219
4220 make_decode_exception(exceptionObject,
4221 encoding,
4222 *input, *inend - *input,
4223 *startinpos, *endinpos,
4224 reason);
4225 if (*exceptionObject == NULL)
4226 goto onError;
4227
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004228 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004229 if (restuple == NULL)
4230 goto onError;
4231 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004232 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004233 goto onError;
4234 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004235 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004236 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004237
4238 /* Copy back the bytes variables, which might have been modified by the
4239 callback */
4240 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4241 if (!inputobj)
4242 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004243 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004244 *input = PyBytes_AS_STRING(inputobj);
4245 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004246 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004247 /* we can DECREF safely, as the exception has another reference,
4248 so the object won't go away. */
4249 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004253 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004254 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004255 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004256 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257
Victor Stinner170ca6f2013-04-18 00:25:28 +02004258 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004259 if (replen > 1) {
4260 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004261 need_to_grow = 1;
4262 }
4263 new_inptr = *input + newpos;
4264 if (*inend - new_inptr > remain) {
4265 /* We don't know the decoding algorithm here so we make the worst
4266 assumption that one byte decodes to one unicode character.
4267 If unfortunately one byte could decode to more unicode characters,
4268 the decoder may write out-of-bound then. Is it possible for the
4269 algorithms using this function? */
4270 writer->min_length += *inend - new_inptr - remain;
4271 need_to_grow = 1;
4272 }
4273 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004274 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004275 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004276 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4277 goto onError;
4278 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004279 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004280 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004283 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004286 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004287 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004291 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292}
4293
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004294/* --- UTF-7 Codec -------------------------------------------------------- */
4295
Antoine Pitrou244651a2009-05-04 18:56:13 +00004296/* See RFC2152 for details. We encode conservatively and decode liberally. */
4297
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  Every macro may evaluate its argument more than once, so
   only pass expressions without side effects. */

/* Non-zero when c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= 'A' && (c) <= 'Z'))

/* Map a base-64 character to its 6-bit value.  The caller must already
   know that c is a base-64 character: anything unrecognised yields 63,
   the value of '/'. */

#define FROM_BASE64(c) \
    (((c) == '+') ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low six bits of n; higher bits are ignored. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte met outside a shift sequence stands for
 * itself.  Decoding is permissive: every value up to 127 is accepted
 * as-is except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4328
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the character code (0..127) and is only ever
 * read, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        character code; NUL and anything >= 128 is never direct
 * directO  non-zero when Set O characters may be written as themselves
 * directWS non-zero when whitespace may be written as itself
 *
 * Both flags are parenthesized in the expansion so that compound
 * expressions (e.g. "a || b") keep their meaning next to the "&&".
 * c is evaluated several times and must have no side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374
Alexander Belopolsky40018472011-02-26 01:02:56 +00004375PyObject *
4376PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004377 Py_ssize_t size,
4378 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004380 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4381}
4382
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383/* The decoder. The only state we preserve is our read position,
4384 * i.e. how many characters we have consumed. So if we end in the
4385 * middle of a shift sequence we have to back off the read position
4386 * and the output to the beginning of the sequence, otherwise we lose
4387 * all the shift state (seen bits, number of bits seen, high
4388 * surrogate). */
4389
Alexander Belopolsky40018472011-02-26 01:02:56 +00004390PyObject *
4391PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004392 Py_ssize_t size,
4393 const char *errors,
4394 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004395{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004397 Py_ssize_t startinpos;
4398 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004400 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 const char *errmsg = "";
4402 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004403 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004404 unsigned int base64bits = 0;
4405 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004406 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 PyObject *errorHandler = NULL;
4408 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004409
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004410 if (size == 0) {
4411 if (consumed)
4412 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004413 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004414 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004416 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004417 _PyUnicodeWriter_Init(&writer);
4418 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419
4420 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004421 e = s + size;
4422
4423 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004424 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004426 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428 if (inShift) { /* in a base-64 section */
4429 if (IS_BASE64(ch)) { /* consume a base-64 character */
4430 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4431 base64bits += 6;
4432 s++;
4433 if (base64bits >= 16) {
4434 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004435 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004436 base64bits -= 16;
4437 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004438 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004439 if (surrogate) {
4440 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004441 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4442 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004443 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004444 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004446 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 }
4448 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004449 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004450 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 }
4453 }
Victor Stinner551ac952011-11-29 22:58:13 +01004454 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 /* first surrogate */
4456 surrogate = outCh;
4457 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004459 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004460 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461 }
4462 }
4463 }
4464 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004465 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 if (base64bits > 0) { /* left-over bits */
4467 if (base64bits >= 6) {
4468 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004469 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 errmsg = "partial character in shift sequence";
4471 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 else {
4474 /* Some bits remain; they should be zero */
4475 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004476 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004477 errmsg = "non-zero padding bits in shift sequence";
4478 goto utf7Error;
4479 }
4480 }
4481 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004482 if (surrogate && DECODE_DIRECT(ch)) {
4483 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4484 goto onError;
4485 }
4486 surrogate = 0;
4487 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004488 /* '-' is absorbed; other terminating
4489 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004490 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004491 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492 }
4493 }
4494 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004496 s++; /* consume '+' */
4497 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004499 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004500 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004502 else if (s < e && !IS_BASE64(*s)) {
4503 s++;
4504 errmsg = "ill-formed sequence";
4505 goto utf7Error;
4506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004509 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004510 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004511 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004512 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513 }
4514 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004515 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004517 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004518 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 else {
4521 startinpos = s-starts;
4522 s++;
4523 errmsg = "unexpected special character";
4524 goto utf7Error;
4525 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004527utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004529 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 errors, &errorHandler,
4531 "utf7", errmsg,
4532 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004533 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004534 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
4536
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537 /* end of string */
4538
4539 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4540 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004541 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 if (surrogate ||
4543 (base64bits >= 6) ||
4544 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004546 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004547 errors, &errorHandler,
4548 "utf7", "unterminated shift sequence",
4549 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004550 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 goto onError;
4552 if (s < e)
4553 goto restart;
4554 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004556
4557 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004558 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004560 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004561 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004562 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004563 writer.kind, writer.data, shiftOutStart);
4564 Py_XDECREF(errorHandler);
4565 Py_XDECREF(exc);
4566 _PyUnicodeWriter_Dealloc(&writer);
4567 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004568 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004569 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 }
4571 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004572 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004574 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 Py_XDECREF(errorHandler);
4577 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004578 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579
Benjamin Peterson29060642009-01-31 22:14:21 +00004580 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 Py_XDECREF(errorHandler);
4582 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004583 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 return NULL;
4585}
4586
4587
Alexander Belopolsky40018472011-02-26 01:02:56 +00004588PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004589_PyUnicode_EncodeUTF7(PyObject *str,
4590 int base64SetO,
4591 int base64WhiteSpace,
4592 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004593{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004594 int kind;
4595 void *data;
4596 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004597 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004599 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 unsigned int base64bits = 0;
4601 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004602 char * out;
4603 char * start;
4604
Benjamin Petersonbac79492012-01-14 13:34:47 -05004605 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004606 return NULL;
4607 kind = PyUnicode_KIND(str);
4608 data = PyUnicode_DATA(str);
4609 len = PyUnicode_GET_LENGTH(str);
4610
4611 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004613
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004614 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004615 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004616 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004617 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 if (v == NULL)
4619 return NULL;
4620
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004621 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004622 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004623 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004624
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625 if (inShift) {
4626 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4627 /* shifting out */
4628 if (base64bits) { /* output remaining bits */
4629 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4630 base64buffer = 0;
4631 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632 }
4633 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 /* Characters not in the BASE64 set implicitly unshift the sequence
4635 so no '-' is required, except if the character is itself a '-' */
4636 if (IS_BASE64(ch) || ch == '-') {
4637 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004639 *out++ = (char) ch;
4640 }
4641 else {
4642 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004643 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 else { /* not in a shift sequence */
4646 if (ch == '+') {
4647 *out++ = '+';
4648 *out++ = '-';
4649 }
4650 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4651 *out++ = (char) ch;
4652 }
4653 else {
4654 *out++ = '+';
4655 inShift = 1;
4656 goto encode_char;
4657 }
4658 }
4659 continue;
4660encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004662 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004663
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664 /* code first surrogate */
4665 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004666 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 while (base64bits >= 6) {
4668 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4669 base64bits -= 6;
4670 }
4671 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004672 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 base64bits += 16;
4675 base64buffer = (base64buffer << 16) | ch;
4676 while (base64bits >= 6) {
4677 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4678 base64bits -= 6;
4679 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004680 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004681 if (base64bits)
4682 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4683 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004684 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004685 if (_PyBytes_Resize(&v, out - start) < 0)
4686 return NULL;
4687 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004688}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004689PyObject *
4690PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4691 Py_ssize_t size,
4692 int base64SetO,
4693 int base64WhiteSpace,
4694 const char *errors)
4695{
4696 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004697 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004698 if (tmp == NULL)
4699 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004700 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004701 base64WhiteSpace, errors);
4702 Py_DECREF(tmp);
4703 return result;
4704}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004705
Antoine Pitrou244651a2009-05-04 18:56:13 +00004706#undef IS_BASE64
4707#undef FROM_BASE64
4708#undef TO_BASE64
4709#undef DECODE_DIRECT
4710#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712/* --- UTF-8 Codec -------------------------------------------------------- */
4713
Alexander Belopolsky40018472011-02-26 01:02:56 +00004714PyObject *
4715PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004716 Py_ssize_t size,
4717 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718{
Walter Dörwald69652032004-09-07 20:24:22 +00004719 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4720}
4721
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722#include "stringlib/asciilib.h"
4723#include "stringlib/codecs.h"
4724#include "stringlib/undef.h"
4725
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004726#include "stringlib/ucs1lib.h"
4727#include "stringlib/codecs.h"
4728#include "stringlib/undef.h"
4729
4730#include "stringlib/ucs2lib.h"
4731#include "stringlib/codecs.h"
4732#include "stringlib/undef.h"
4733
4734#include "stringlib/ucs4lib.h"
4735#include "stringlib/codecs.h"
4736#include "stringlib/undef.h"
4737
Antoine Pitrouab868312009-01-10 15:40:25 +00004738/* Mask to quickly check whether a C 'long' contains a
4739 non-ASCII, UTF8-encoded char. */
4740#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004741# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004742#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004743# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004744#else
4745# error C 'long' size should be either 4 or 8!
4746#endif
4747
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748static Py_ssize_t
4749ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004750{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004752 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004753
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004754 /*
4755 * Issue #17237: m68k is a bit different from most architectures in
4756 * that objects do not use "natural alignment" - for example, int and
4757 * long are only aligned at 2-byte boundaries. Therefore the assert()
4758 * won't work; also, tests have shown that skipping the "optimised
4759 * version" will even speed up m68k.
4760 */
4761#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004762#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004763 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4764 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004765 /* Fast path, see in STRINGLIB(utf8_decode) for
4766 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004767 /* Help allocation */
4768 const char *_p = p;
4769 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 while (_p < aligned_end) {
4771 unsigned long value = *(const unsigned long *) _p;
4772 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004774 *((unsigned long *)q) = value;
4775 _p += SIZEOF_LONG;
4776 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004777 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004778 p = _p;
4779 while (p < end) {
4780 if ((unsigned char)*p & 0x80)
4781 break;
4782 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004786#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004787#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004788 while (p < end) {
4789 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4790 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004791 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004792 /* Help allocation */
4793 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 while (_p < aligned_end) {
4795 unsigned long value = *(unsigned long *) _p;
4796 if (value & ASCII_CHAR_MASK)
4797 break;
4798 _p += SIZEOF_LONG;
4799 }
4800 p = _p;
4801 if (_p == end)
4802 break;
4803 }
4804 if ((unsigned char)*p & 0x80)
4805 break;
4806 ++p;
4807 }
4808 memcpy(dest, start, p - start);
4809 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810}
Antoine Pitrouab868312009-01-10 15:40:25 +00004811
Victor Stinner785938e2011-12-11 20:09:03 +01004812PyObject *
4813PyUnicode_DecodeUTF8Stateful(const char *s,
4814 Py_ssize_t size,
4815 const char *errors,
4816 Py_ssize_t *consumed)
4817{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004818 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004819 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004820 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004821
4822 Py_ssize_t startinpos;
4823 Py_ssize_t endinpos;
4824 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004825 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004826 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004827 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004828
4829 if (size == 0) {
4830 if (consumed)
4831 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004832 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004833 }
4834
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004835 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4836 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004837 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004838 *consumed = 1;
4839 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004840 }
4841
Victor Stinner8f674cc2013-04-17 23:02:17 +02004842 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004843 writer.min_length = size;
4844 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004845 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004846
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004847 writer.pos = ascii_decode(s, end, writer.data);
4848 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849 while (s < end) {
4850 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004851 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004852
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004853 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004854 if (PyUnicode_IS_ASCII(writer.buffer))
4855 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004856 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004857 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004858 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004859 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004860 } else {
4861 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004862 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004863 }
4864
4865 switch (ch) {
4866 case 0:
4867 if (s == end || consumed)
4868 goto End;
4869 errmsg = "unexpected end of data";
4870 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004871 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004872 break;
4873 case 1:
4874 errmsg = "invalid start byte";
4875 startinpos = s - starts;
4876 endinpos = startinpos + 1;
4877 break;
4878 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004879 case 3:
4880 case 4:
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02004881 if (s == end || consumed) {
4882 goto End;
4883 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884 errmsg = "invalid continuation byte";
4885 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004886 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 break;
4888 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004889 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004890 goto onError;
4891 continue;
4892 }
4893
Victor Stinner1d65d912015-10-05 13:43:50 +02004894 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004895 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004896
4897 switch (error_handler) {
4898 case _Py_ERROR_IGNORE:
4899 s += (endinpos - startinpos);
4900 break;
4901
4902 case _Py_ERROR_REPLACE:
4903 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4904 goto onError;
4905 s += (endinpos - startinpos);
4906 break;
4907
4908 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004909 {
4910 Py_ssize_t i;
4911
Victor Stinner1d65d912015-10-05 13:43:50 +02004912 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4913 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004914 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004915 ch = (Py_UCS4)(unsigned char)(starts[i]);
4916 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4917 ch + 0xdc00);
4918 writer.pos++;
4919 }
4920 s += (endinpos - startinpos);
4921 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004922 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004923
4924 default:
4925 if (unicode_decode_call_errorhandler_writer(
4926 errors, &error_handler_obj,
4927 "utf-8", errmsg,
4928 &starts, &end, &startinpos, &endinpos, &exc, &s,
4929 &writer))
4930 goto onError;
4931 }
Victor Stinner785938e2011-12-11 20:09:03 +01004932 }
4933
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004934End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004935 if (consumed)
4936 *consumed = s - starts;
4937
Victor Stinner1d65d912015-10-05 13:43:50 +02004938 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004939 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004940 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004941
4942onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004943 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004944 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004945 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004946 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004947}
4948
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004949
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004950/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4951 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004952
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004953 On success, write a pointer to a newly allocated wide character string into
4954 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4955 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004956
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004957 On memory allocation failure, return -1.
4958
4959 On decoding error (if surrogateescape is zero), return -2. If wlen is
4960 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4961 is not NULL, write the decoding error message into *reason. */
4962int
4963_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02004964 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004965{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004966 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004967 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004968 wchar_t *unicode;
4969 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004970
Victor Stinner3d4226a2018-08-29 22:21:32 +02004971 int surrogateescape = 0;
4972 int surrogatepass = 0;
4973 switch (errors)
4974 {
4975 case _Py_ERROR_STRICT:
4976 break;
4977 case _Py_ERROR_SURROGATEESCAPE:
4978 surrogateescape = 1;
4979 break;
4980 case _Py_ERROR_SURROGATEPASS:
4981 surrogatepass = 1;
4982 break;
4983 default:
4984 return -3;
4985 }
4986
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004987 /* Note: size will always be longer than the resulting Unicode
4988 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004989 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004990 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004991 }
4992
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004993 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004994 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004995 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004996 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004997
4998 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004999 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005001 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005002 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005003#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005004 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005005#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005006 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005007#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005008 if (ch > 0xFF) {
5009#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005010 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005012 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005013 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005014 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5015 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5016#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005017 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005018 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005019 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005020 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005021 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005022
5023 if (surrogateescape) {
5024 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5025 }
5026 else {
5027 /* Is it a valid three-byte code? */
5028 if (surrogatepass
5029 && (e - s) >= 3
5030 && (s[0] & 0xf0) == 0xe0
5031 && (s[1] & 0xc0) == 0x80
5032 && (s[2] & 0xc0) == 0x80)
5033 {
5034 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5035 s += 3;
5036 unicode[outpos++] = ch;
5037 }
5038 else {
5039 PyMem_RawFree(unicode );
5040 if (reason != NULL) {
5041 switch (ch) {
5042 case 0:
5043 *reason = "unexpected end of data";
5044 break;
5045 case 1:
5046 *reason = "invalid start byte";
5047 break;
5048 /* 2, 3, 4 */
5049 default:
5050 *reason = "invalid continuation byte";
5051 break;
5052 }
5053 }
5054 if (wlen != NULL) {
5055 *wlen = s - orig_s;
5056 }
5057 return -2;
5058 }
5059 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005060 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005061 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005062 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005063 if (wlen) {
5064 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005065 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005066 *wstr = unicode;
5067 return 0;
5068}
5069
Victor Stinner5f9cf232019-03-19 01:46:25 +01005070
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005071wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005072_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5073 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005074{
5075 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005076 int res = _Py_DecodeUTF8Ex(arg, arglen,
5077 &wstr, wlen,
5078 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005079 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005080 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5081 assert(res != -3);
5082 if (wlen) {
5083 *wlen = (size_t)res;
5084 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005085 return NULL;
5086 }
5087 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005088}
5089
Antoine Pitrouab868312009-01-10 15:40:25 +00005090
Victor Stinnere47e6982017-12-21 15:45:16 +01005091/* UTF-8 encoder using the surrogateescape error handler .
5092
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005093 On success, return 0 and write the newly allocated character string (use
5094 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005095
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005096 On encoding failure, return -2 and write the position of the invalid
5097 surrogate character into *error_pos (if error_pos is set) and the decoding
5098 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005099
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005100 On memory allocation failure, return -1. */
5101int
5102_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005103 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005104{
5105 const Py_ssize_t max_char_size = 4;
5106 Py_ssize_t len = wcslen(text);
5107
5108 assert(len >= 0);
5109
Victor Stinner3d4226a2018-08-29 22:21:32 +02005110 int surrogateescape = 0;
5111 int surrogatepass = 0;
5112 switch (errors)
5113 {
5114 case _Py_ERROR_STRICT:
5115 break;
5116 case _Py_ERROR_SURROGATEESCAPE:
5117 surrogateescape = 1;
5118 break;
5119 case _Py_ERROR_SURROGATEPASS:
5120 surrogatepass = 1;
5121 break;
5122 default:
5123 return -3;
5124 }
5125
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005126 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5127 return -1;
5128 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005129 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005130 if (raw_malloc) {
5131 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005132 }
5133 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005134 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005135 }
5136 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005137 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005138 }
5139
5140 char *p = bytes;
5141 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005142 for (i = 0; i < len; ) {
5143 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005144 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005145 i++;
5146#if Py_UNICODE_SIZE == 2
5147 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5148 && i < len
5149 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5150 {
5151 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5152 i++;
5153 }
5154#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005155
5156 if (ch < 0x80) {
5157 /* Encode ASCII */
5158 *p++ = (char) ch;
5159
5160 }
5161 else if (ch < 0x0800) {
5162 /* Encode Latin-1 */
5163 *p++ = (char)(0xc0 | (ch >> 6));
5164 *p++ = (char)(0x80 | (ch & 0x3f));
5165 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005166 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005167 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005168 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005169 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005170 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005171 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005172 if (reason != NULL) {
5173 *reason = "encoding error";
5174 }
5175 if (raw_malloc) {
5176 PyMem_RawFree(bytes);
5177 }
5178 else {
5179 PyMem_Free(bytes);
5180 }
5181 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005182 }
5183 *p++ = (char)(ch & 0xff);
5184 }
5185 else if (ch < 0x10000) {
5186 *p++ = (char)(0xe0 | (ch >> 12));
5187 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5188 *p++ = (char)(0x80 | (ch & 0x3f));
5189 }
5190 else { /* ch >= 0x10000 */
5191 assert(ch <= MAX_UNICODE);
5192 /* Encode UCS4 Unicode ordinals */
5193 *p++ = (char)(0xf0 | (ch >> 18));
5194 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5195 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5196 *p++ = (char)(0x80 | (ch & 0x3f));
5197 }
5198 }
5199 *p++ = '\0';
5200
5201 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005202 char *bytes2;
5203 if (raw_malloc) {
5204 bytes2 = PyMem_RawRealloc(bytes, final_size);
5205 }
5206 else {
5207 bytes2 = PyMem_Realloc(bytes, final_size);
5208 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005209 if (bytes2 == NULL) {
5210 if (error_pos != NULL) {
5211 *error_pos = (size_t)-1;
5212 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005213 if (raw_malloc) {
5214 PyMem_RawFree(bytes);
5215 }
5216 else {
5217 PyMem_Free(bytes);
5218 }
5219 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005220 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005221 *str = bytes2;
5222 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005223}
5224
5225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005226/* Primary internal function which creates utf8 encoded bytes objects.
5227
5228 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005229 and allocate exactly as much space needed at the end. Else allocate the
5230 maximum possible needed (4 result bytes per Unicode character), and return
5231 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005232*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005233PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005234_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235{
Victor Stinner6099a032011-12-18 14:22:26 +01005236 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005237 void *data;
5238 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005240 if (!PyUnicode_Check(unicode)) {
5241 PyErr_BadArgument();
5242 return NULL;
5243 }
5244
5245 if (PyUnicode_READY(unicode) == -1)
5246 return NULL;
5247
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005248 if (PyUnicode_UTF8(unicode))
5249 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5250 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005251
5252 kind = PyUnicode_KIND(unicode);
5253 data = PyUnicode_DATA(unicode);
5254 size = PyUnicode_GET_LENGTH(unicode);
5255
Benjamin Petersonead6b532011-12-20 17:23:42 -06005256 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005257 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005258 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005259 case PyUnicode_1BYTE_KIND:
5260 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5261 assert(!PyUnicode_IS_ASCII(unicode));
5262 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5263 case PyUnicode_2BYTE_KIND:
5264 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5265 case PyUnicode_4BYTE_KIND:
5266 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005267 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268}
5269
Alexander Belopolsky40018472011-02-26 01:02:56 +00005270PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005271PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5272 Py_ssize_t size,
5273 const char *errors)
5274{
5275 PyObject *v, *unicode;
5276
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005277 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005278 if (unicode == NULL)
5279 return NULL;
5280 v = _PyUnicode_AsUTF8String(unicode, errors);
5281 Py_DECREF(unicode);
5282 return v;
5283}
5284
5285PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005286PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005288 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289}
5290
Walter Dörwald41980ca2007-08-16 21:55:45 +00005291/* --- UTF-32 Codec ------------------------------------------------------- */
5292
5293PyObject *
5294PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005295 Py_ssize_t size,
5296 const char *errors,
5297 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005298{
5299 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5300}
5301
5302PyObject *
5303PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005304 Py_ssize_t size,
5305 const char *errors,
5306 int *byteorder,
5307 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005308{
5309 const char *starts = s;
5310 Py_ssize_t startinpos;
5311 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005312 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005313 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005314 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005315 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005316 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005317 PyObject *errorHandler = NULL;
5318 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005319
Walter Dörwald41980ca2007-08-16 21:55:45 +00005320 q = (unsigned char *)s;
5321 e = q + size;
5322
5323 if (byteorder)
5324 bo = *byteorder;
5325
5326 /* Check for BOM marks (U+FEFF) in the input and adjust current
5327 byte order setting accordingly. In native mode, the leading BOM
5328 mark is skipped, in all other modes, it is copied to the output
5329 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005330 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005331 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005332 if (bom == 0x0000FEFF) {
5333 bo = -1;
5334 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005335 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005336 else if (bom == 0xFFFE0000) {
5337 bo = 1;
5338 q += 4;
5339 }
5340 if (byteorder)
5341 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005342 }
5343
Victor Stinnere64322e2012-10-30 23:12:47 +01005344 if (q == e) {
5345 if (consumed)
5346 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005347 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005348 }
5349
Victor Stinnere64322e2012-10-30 23:12:47 +01005350#ifdef WORDS_BIGENDIAN
5351 le = bo < 0;
5352#else
5353 le = bo <= 0;
5354#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005355 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005356
Victor Stinner8f674cc2013-04-17 23:02:17 +02005357 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005358 writer.min_length = (e - q + 3) / 4;
5359 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005360 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005361
Victor Stinnere64322e2012-10-30 23:12:47 +01005362 while (1) {
5363 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005365
Victor Stinnere64322e2012-10-30 23:12:47 +01005366 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005367 enum PyUnicode_Kind kind = writer.kind;
5368 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005369 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005370 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005371 if (le) {
5372 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005373 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005374 if (ch > maxch)
5375 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005376 if (kind != PyUnicode_1BYTE_KIND &&
5377 Py_UNICODE_IS_SURROGATE(ch))
5378 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005379 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005380 q += 4;
5381 } while (q <= last);
5382 }
5383 else {
5384 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005385 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005386 if (ch > maxch)
5387 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005388 if (kind != PyUnicode_1BYTE_KIND &&
5389 Py_UNICODE_IS_SURROGATE(ch))
5390 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005391 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005392 q += 4;
5393 } while (q <= last);
5394 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005395 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005396 }
5397
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005398 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005399 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005400 startinpos = ((const char *)q) - starts;
5401 endinpos = startinpos + 4;
5402 }
5403 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005404 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005406 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005408 startinpos = ((const char *)q) - starts;
5409 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005411 else {
5412 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005413 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005414 goto onError;
5415 q += 4;
5416 continue;
5417 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005418 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005419 startinpos = ((const char *)q) - starts;
5420 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005422
5423 /* The remaining input chars are ignored if the callback
5424 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005425 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005426 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005427 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005429 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005430 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005431 }
5432
Walter Dörwald41980ca2007-08-16 21:55:45 +00005433 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005434 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005435
Walter Dörwald41980ca2007-08-16 21:55:45 +00005436 Py_XDECREF(errorHandler);
5437 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005438 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005439
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005441 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005442 Py_XDECREF(errorHandler);
5443 Py_XDECREF(exc);
5444 return NULL;
5445}
5446
5447PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005448_PyUnicode_EncodeUTF32(PyObject *str,
5449 const char *errors,
5450 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005451{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005452 enum PyUnicode_Kind kind;
5453 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005454 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005455 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005456 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005457#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005458 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005459#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005460 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005461#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005462 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005463 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005464 PyObject *errorHandler = NULL;
5465 PyObject *exc = NULL;
5466 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005467
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005468 if (!PyUnicode_Check(str)) {
5469 PyErr_BadArgument();
5470 return NULL;
5471 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005472 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005473 return NULL;
5474 kind = PyUnicode_KIND(str);
5475 data = PyUnicode_DATA(str);
5476 len = PyUnicode_GET_LENGTH(str);
5477
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005478 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005479 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005480 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005481 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005482 if (v == NULL)
5483 return NULL;
5484
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005485 /* output buffer is 4-bytes aligned */
5486 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005487 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005488 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005489 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005490 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005491 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005492
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005493 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005495 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005496 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 else
5498 encoding = "utf-32";
5499
5500 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005501 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5502 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005503 }
5504
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005505 pos = 0;
5506 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005507 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005508
5509 if (kind == PyUnicode_2BYTE_KIND) {
5510 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5511 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005512 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005513 else {
5514 assert(kind == PyUnicode_4BYTE_KIND);
5515 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5516 &out, native_ordering);
5517 }
5518 if (pos == len)
5519 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005520
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005521 rep = unicode_encode_call_errorhandler(
5522 errors, &errorHandler,
5523 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005524 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005525 if (!rep)
5526 goto error;
5527
5528 if (PyBytes_Check(rep)) {
5529 repsize = PyBytes_GET_SIZE(rep);
5530 if (repsize & 3) {
5531 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005532 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005533 "surrogates not allowed");
5534 goto error;
5535 }
5536 moreunits = repsize / 4;
5537 }
5538 else {
5539 assert(PyUnicode_Check(rep));
5540 if (PyUnicode_READY(rep) < 0)
5541 goto error;
5542 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5543 if (!PyUnicode_IS_ASCII(rep)) {
5544 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005545 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005546 "surrogates not allowed");
5547 goto error;
5548 }
5549 }
5550
5551 /* four bytes are reserved for each surrogate */
5552 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005553 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005554 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005555 /* integer overflow */
5556 PyErr_NoMemory();
5557 goto error;
5558 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005559 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005560 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005561 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005562 }
5563
5564 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005565 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005566 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005567 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005568 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005569 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5570 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005571 }
5572
5573 Py_CLEAR(rep);
5574 }
5575
5576 /* Cut back to size actually needed. This is necessary for, for example,
5577 encoding of a string containing isolated surrogates and the 'ignore'
5578 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005579 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005580 if (nsize != PyBytes_GET_SIZE(v))
5581 _PyBytes_Resize(&v, nsize);
5582 Py_XDECREF(errorHandler);
5583 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005584 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005585 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005586 error:
5587 Py_XDECREF(rep);
5588 Py_XDECREF(errorHandler);
5589 Py_XDECREF(exc);
5590 Py_XDECREF(v);
5591 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005592}
5593
Alexander Belopolsky40018472011-02-26 01:02:56 +00005594PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005595PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5596 Py_ssize_t size,
5597 const char *errors,
5598 int byteorder)
5599{
5600 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005601 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005602 if (tmp == NULL)
5603 return NULL;
5604 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5605 Py_DECREF(tmp);
5606 return result;
5607}
5608
5609PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005610PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005611{
Victor Stinnerb960b342011-11-20 19:12:52 +01005612 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005613}
5614
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615/* --- UTF-16 Codec ------------------------------------------------------- */
5616
Tim Peters772747b2001-08-09 22:21:55 +00005617PyObject *
5618PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005619 Py_ssize_t size,
5620 const char *errors,
5621 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622{
Walter Dörwald69652032004-09-07 20:24:22 +00005623 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5624}
5625
5626PyObject *
5627PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 Py_ssize_t size,
5629 const char *errors,
5630 int *byteorder,
5631 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005632{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005634 Py_ssize_t startinpos;
5635 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005636 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005637 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005638 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005639 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005640 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 PyObject *errorHandler = NULL;
5642 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005643 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644
Tim Peters772747b2001-08-09 22:21:55 +00005645 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005646 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647
5648 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005649 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005651 /* Check for BOM marks (U+FEFF) in the input and adjust current
5652 byte order setting accordingly. In native mode, the leading BOM
5653 mark is skipped, in all other modes, it is copied to the output
5654 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005655 if (bo == 0 && size >= 2) {
5656 const Py_UCS4 bom = (q[1] << 8) | q[0];
5657 if (bom == 0xFEFF) {
5658 q += 2;
5659 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005661 else if (bom == 0xFFFE) {
5662 q += 2;
5663 bo = 1;
5664 }
5665 if (byteorder)
5666 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668
Antoine Pitrou63065d72012-05-15 23:48:04 +02005669 if (q == e) {
5670 if (consumed)
5671 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005672 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005673 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005674
Christian Heimes743e0cd2012-10-17 23:52:17 +02005675#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005676 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005677 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005678#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005679 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005680 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005681#endif
Tim Peters772747b2001-08-09 22:21:55 +00005682
Antoine Pitrou63065d72012-05-15 23:48:04 +02005683 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005684 character count normally. Error handler will take care of
5685 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005686 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005687 writer.min_length = (e - q + 1) / 2;
5688 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005690
Antoine Pitrou63065d72012-05-15 23:48:04 +02005691 while (1) {
5692 Py_UCS4 ch = 0;
5693 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005694 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005695 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005696 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005697 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005698 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005699 native_ordering);
5700 else
5701 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005702 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005703 native_ordering);
5704 } else if (kind == PyUnicode_2BYTE_KIND) {
5705 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005706 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005707 native_ordering);
5708 } else {
5709 assert(kind == PyUnicode_4BYTE_KIND);
5710 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005711 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005712 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005713 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005714 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715
Antoine Pitrou63065d72012-05-15 23:48:04 +02005716 switch (ch)
5717 {
5718 case 0:
5719 /* remaining byte at the end? (size should be even) */
5720 if (q == e || consumed)
5721 goto End;
5722 errmsg = "truncated data";
5723 startinpos = ((const char *)q) - starts;
5724 endinpos = ((const char *)e) - starts;
5725 break;
5726 /* The remaining input chars are ignored if the callback
5727 chooses to skip the input */
5728 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005729 q -= 2;
5730 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005731 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005732 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005733 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005734 endinpos = ((const char *)e) - starts;
5735 break;
5736 case 2:
5737 errmsg = "illegal encoding";
5738 startinpos = ((const char *)q) - 2 - starts;
5739 endinpos = startinpos + 2;
5740 break;
5741 case 3:
5742 errmsg = "illegal UTF-16 surrogate";
5743 startinpos = ((const char *)q) - 4 - starts;
5744 endinpos = startinpos + 2;
5745 break;
5746 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005747 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005748 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 continue;
5750 }
5751
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005752 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005753 errors,
5754 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005755 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005756 &starts,
5757 (const char **)&e,
5758 &startinpos,
5759 &endinpos,
5760 &exc,
5761 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005762 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 }
5765
Antoine Pitrou63065d72012-05-15 23:48:04 +02005766End:
Walter Dörwald69652032004-09-07 20:24:22 +00005767 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005769
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 Py_XDECREF(errorHandler);
5771 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005772 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005775 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 Py_XDECREF(errorHandler);
5777 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 return NULL;
5779}
5780
Tim Peters772747b2001-08-09 22:21:55 +00005781PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005782_PyUnicode_EncodeUTF16(PyObject *str,
5783 const char *errors,
5784 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005786 enum PyUnicode_Kind kind;
5787 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005788 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005789 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005790 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005791 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005792#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005793 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005794#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005795 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005796#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005797 const char *encoding;
5798 Py_ssize_t nsize, pos;
5799 PyObject *errorHandler = NULL;
5800 PyObject *exc = NULL;
5801 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005802
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005803 if (!PyUnicode_Check(str)) {
5804 PyErr_BadArgument();
5805 return NULL;
5806 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005807 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005808 return NULL;
5809 kind = PyUnicode_KIND(str);
5810 data = PyUnicode_DATA(str);
5811 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005812
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005813 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005814 if (kind == PyUnicode_4BYTE_KIND) {
5815 const Py_UCS4 *in = (const Py_UCS4 *)data;
5816 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005817 while (in < end) {
5818 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005819 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005820 }
5821 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005822 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005823 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005825 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005826 nsize = len + pairs + (byteorder == 0);
5827 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005828 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005830 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005832 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005833 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005834 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005835 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005836 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005837 }
5838 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005839 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005840 }
Tim Peters772747b2001-08-09 22:21:55 +00005841
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005842 if (kind == PyUnicode_1BYTE_KIND) {
5843 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5844 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005845 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005846
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005847 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005848 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005849 }
5850 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005851 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005852 }
5853 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005854 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005855 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005856
5857 pos = 0;
5858 while (pos < len) {
5859 Py_ssize_t repsize, moreunits;
5860
5861 if (kind == PyUnicode_2BYTE_KIND) {
5862 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5863 &out, native_ordering);
5864 }
5865 else {
5866 assert(kind == PyUnicode_4BYTE_KIND);
5867 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5868 &out, native_ordering);
5869 }
5870 if (pos == len)
5871 break;
5872
5873 rep = unicode_encode_call_errorhandler(
5874 errors, &errorHandler,
5875 encoding, "surrogates not allowed",
5876 str, &exc, pos, pos + 1, &pos);
5877 if (!rep)
5878 goto error;
5879
5880 if (PyBytes_Check(rep)) {
5881 repsize = PyBytes_GET_SIZE(rep);
5882 if (repsize & 1) {
5883 raise_encode_exception(&exc, encoding,
5884 str, pos - 1, pos,
5885 "surrogates not allowed");
5886 goto error;
5887 }
5888 moreunits = repsize / 2;
5889 }
5890 else {
5891 assert(PyUnicode_Check(rep));
5892 if (PyUnicode_READY(rep) < 0)
5893 goto error;
5894 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5895 if (!PyUnicode_IS_ASCII(rep)) {
5896 raise_encode_exception(&exc, encoding,
5897 str, pos - 1, pos,
5898 "surrogates not allowed");
5899 goto error;
5900 }
5901 }
5902
5903 /* two bytes are reserved for each surrogate */
5904 if (moreunits > 1) {
5905 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005906 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005907 /* integer overflow */
5908 PyErr_NoMemory();
5909 goto error;
5910 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005911 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005912 goto error;
5913 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5914 }
5915
5916 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005917 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005918 out += moreunits;
5919 } else /* rep is unicode */ {
5920 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5921 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5922 &out, native_ordering);
5923 }
5924
5925 Py_CLEAR(rep);
5926 }
5927
5928 /* Cut back to size actually needed. This is necessary for, for example,
5929 encoding of a string containing isolated surrogates and the 'ignore' handler
5930 is used. */
5931 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5932 if (nsize != PyBytes_GET_SIZE(v))
5933 _PyBytes_Resize(&v, nsize);
5934 Py_XDECREF(errorHandler);
5935 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005936 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005937 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005938 error:
5939 Py_XDECREF(rep);
5940 Py_XDECREF(errorHandler);
5941 Py_XDECREF(exc);
5942 Py_XDECREF(v);
5943 return NULL;
5944#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945}
5946
Alexander Belopolsky40018472011-02-26 01:02:56 +00005947PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005948PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5949 Py_ssize_t size,
5950 const char *errors,
5951 int byteorder)
5952{
5953 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005954 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005955 if (tmp == NULL)
5956 return NULL;
5957 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5958 Py_DECREF(tmp);
5959 return result;
5960}
5961
5962PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005963PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005965 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966}
5967
5968/* --- Unicode Escape Codec ----------------------------------------------- */
5969
Fredrik Lundh06d12682001-01-24 07:59:11 +00005970static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005971
Alexander Belopolsky40018472011-02-26 01:02:56 +00005972PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005973_PyUnicode_DecodeUnicodeEscape(const char *s,
5974 Py_ssize_t size,
5975 const char *errors,
5976 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005978 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005979 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005981 PyObject *errorHandler = NULL;
5982 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005983
Eric V. Smith42454af2016-10-31 09:22:08 -04005984 // so we can remember if we've seen an invalid escape char or not
5985 *first_invalid_escape = NULL;
5986
Victor Stinner62ec3312016-09-06 17:04:34 -07005987 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005988 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005989 }
5990 /* Escaped strings will always be longer than the resulting
5991 Unicode string, so we start with size here and then reduce the
5992 length after conversion to the true value.
5993 (but if the error callback returns a long replacement string
5994 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005995 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005996 writer.min_length = size;
5997 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5998 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005999 }
6000
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 end = s + size;
6002 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006003 unsigned char c = (unsigned char) *s++;
6004 Py_UCS4 ch;
6005 int count;
6006 Py_ssize_t startinpos;
6007 Py_ssize_t endinpos;
6008 const char *message;
6009
6010#define WRITE_ASCII_CHAR(ch) \
6011 do { \
6012 assert(ch <= 127); \
6013 assert(writer.pos < writer.size); \
6014 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6015 } while(0)
6016
6017#define WRITE_CHAR(ch) \
6018 do { \
6019 if (ch <= writer.maxchar) { \
6020 assert(writer.pos < writer.size); \
6021 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6022 } \
6023 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6024 goto onError; \
6025 } \
6026 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027
6028 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006029 if (c != '\\') {
6030 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 continue;
6032 }
6033
Victor Stinner62ec3312016-09-06 17:04:34 -07006034 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006036 if (s >= end) {
6037 message = "\\ at end of string";
6038 goto error;
6039 }
6040 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006041
Victor Stinner62ec3312016-09-06 17:04:34 -07006042 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006043 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006046 case '\n': continue;
6047 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6048 case '\'': WRITE_ASCII_CHAR('\''); continue;
6049 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6050 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006051 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006052 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6053 case 't': WRITE_ASCII_CHAR('\t'); continue;
6054 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6055 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006056 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006057 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006058 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006059 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 case '0': case '1': case '2': case '3':
6063 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006064 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006065 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006066 ch = (ch<<3) + *s++ - '0';
6067 if (s < end && '0' <= *s && *s <= '7') {
6068 ch = (ch<<3) + *s++ - '0';
6069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006071 WRITE_CHAR(ch);
6072 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073
Benjamin Peterson29060642009-01-31 22:14:21 +00006074 /* hex escapes */
6075 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006077 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006078 message = "truncated \\xXX escape";
6079 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006083 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006084 message = "truncated \\uXXXX escape";
6085 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006088 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006089 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006090 message = "truncated \\UXXXXXXXX escape";
6091 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006092 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006093 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006094 ch <<= 4;
6095 if (c >= '0' && c <= '9') {
6096 ch += c - '0';
6097 }
6098 else if (c >= 'a' && c <= 'f') {
6099 ch += c - ('a' - 10);
6100 }
6101 else if (c >= 'A' && c <= 'F') {
6102 ch += c - ('A' - 10);
6103 }
6104 else {
6105 break;
6106 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006107 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006108 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006109 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006110 }
6111
6112 /* when we get here, ch is a 32-bit unicode character */
6113 if (ch > MAX_UNICODE) {
6114 message = "illegal Unicode character";
6115 goto error;
6116 }
6117
6118 WRITE_CHAR(ch);
6119 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006120
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006122 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006123 if (ucnhash_CAPI == NULL) {
6124 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006125 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6126 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006127 if (ucnhash_CAPI == NULL) {
6128 PyErr_SetString(
6129 PyExc_UnicodeError,
6130 "\\N escapes not supported (can't load unicodedata module)"
6131 );
6132 goto onError;
6133 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006134 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006135
6136 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006137 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006138 const char *start = ++s;
6139 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006140 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006141 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006142 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006143 namelen = s - start;
6144 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006145 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006146 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006147 ch = 0xffffffff; /* in case 'getcode' messes up */
6148 if (namelen <= INT_MAX &&
6149 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6150 &ch, 0)) {
6151 assert(ch <= MAX_UNICODE);
6152 WRITE_CHAR(ch);
6153 continue;
6154 }
6155 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006156 }
6157 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006158 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006159
6160 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006161 if (*first_invalid_escape == NULL) {
6162 *first_invalid_escape = s-1; /* Back up one char, since we've
6163 already incremented s. */
6164 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006165 WRITE_ASCII_CHAR('\\');
6166 WRITE_CHAR(c);
6167 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006169
6170 error:
6171 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006172 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006173 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006174 errors, &errorHandler,
6175 "unicodeescape", message,
6176 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006177 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006178 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006179 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006180 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006181
6182#undef WRITE_ASCII_CHAR
6183#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006185
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006186 Py_XDECREF(errorHandler);
6187 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006188 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006189
Benjamin Peterson29060642009-01-31 22:14:21 +00006190 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006191 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192 Py_XDECREF(errorHandler);
6193 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 return NULL;
6195}
6196
Eric V. Smith42454af2016-10-31 09:22:08 -04006197PyObject *
6198PyUnicode_DecodeUnicodeEscape(const char *s,
6199 Py_ssize_t size,
6200 const char *errors)
6201{
6202 const char *first_invalid_escape;
6203 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6204 &first_invalid_escape);
6205 if (result == NULL)
6206 return NULL;
6207 if (first_invalid_escape != NULL) {
6208 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6209 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006210 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006211 Py_DECREF(result);
6212 return NULL;
6213 }
6214 }
6215 return result;
6216}
6217
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006218/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219
Alexander Belopolsky40018472011-02-26 01:02:56 +00006220PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006221PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006223 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006224 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006226 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006227 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006228 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229
Ezio Melottie7f90372012-10-05 03:33:31 +03006230 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006231 escape.
6232
Ezio Melottie7f90372012-10-05 03:33:31 +03006233 For UCS1 strings it's '\xxx', 4 bytes per source character.
6234 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6235 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006236 */
6237
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006238 if (!PyUnicode_Check(unicode)) {
6239 PyErr_BadArgument();
6240 return NULL;
6241 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006243 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006244 }
Victor Stinner358af132015-10-12 22:36:57 +02006245
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006246 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006247 if (len == 0) {
6248 return PyBytes_FromStringAndSize(NULL, 0);
6249 }
6250
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006251 kind = PyUnicode_KIND(unicode);
6252 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006253 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6254 bytes, and 1 byte characters 4. */
6255 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006256 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006257 return PyErr_NoMemory();
6258 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006259 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006260 if (repr == NULL) {
6261 return NULL;
6262 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006263
Victor Stinner62ec3312016-09-06 17:04:34 -07006264 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006265 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006266 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006267
Victor Stinner62ec3312016-09-06 17:04:34 -07006268 /* U+0000-U+00ff range */
6269 if (ch < 0x100) {
6270 if (ch >= ' ' && ch < 127) {
6271 if (ch != '\\') {
6272 /* Copy printable US ASCII as-is */
6273 *p++ = (char) ch;
6274 }
6275 /* Escape backslashes */
6276 else {
6277 *p++ = '\\';
6278 *p++ = '\\';
6279 }
6280 }
Victor Stinner358af132015-10-12 22:36:57 +02006281
Victor Stinner62ec3312016-09-06 17:04:34 -07006282 /* Map special whitespace to '\t', \n', '\r' */
6283 else if (ch == '\t') {
6284 *p++ = '\\';
6285 *p++ = 't';
6286 }
6287 else if (ch == '\n') {
6288 *p++ = '\\';
6289 *p++ = 'n';
6290 }
6291 else if (ch == '\r') {
6292 *p++ = '\\';
6293 *p++ = 'r';
6294 }
6295
6296 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6297 else {
6298 *p++ = '\\';
6299 *p++ = 'x';
6300 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6301 *p++ = Py_hexdigits[ch & 0x000F];
6302 }
Tim Petersced69f82003-09-16 20:30:58 +00006303 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006304 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 *p++ = '\\';
6307 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006308 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6309 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6310 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6311 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006313 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6314 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006315
Victor Stinner62ec3312016-09-06 17:04:34 -07006316 /* Make sure that the first two digits are zero */
6317 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006318 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 *p++ = 'U';
6320 *p++ = '0';
6321 *p++ = '0';
6322 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6323 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6324 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6325 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6326 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6327 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330
Victor Stinner62ec3312016-09-06 17:04:34 -07006331 assert(p - PyBytes_AS_STRING(repr) > 0);
6332 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6333 return NULL;
6334 }
6335 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336}
6337
Alexander Belopolsky40018472011-02-26 01:02:56 +00006338PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006339PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6340 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006342 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006343 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006344 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006346 }
6347
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006348 result = PyUnicode_AsUnicodeEscapeString(tmp);
6349 Py_DECREF(tmp);
6350 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351}
6352
6353/* --- Raw Unicode Escape Codec ------------------------------------------- */
6354
Alexander Belopolsky40018472011-02-26 01:02:56 +00006355PyObject *
6356PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006357 Py_ssize_t size,
6358 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006361 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363 PyObject *errorHandler = NULL;
6364 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006365
Victor Stinner62ec3312016-09-06 17:04:34 -07006366 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006367 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006368 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006369
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 /* Escaped strings will always be longer than the resulting
6371 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006372 length after conversion to the true value. (But decoding error
6373 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006374 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006375 writer.min_length = size;
6376 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6377 goto onError;
6378 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006379
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 end = s + size;
6381 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006382 unsigned char c = (unsigned char) *s++;
6383 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006384 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006385 Py_ssize_t startinpos;
6386 Py_ssize_t endinpos;
6387 const char *message;
6388
6389#define WRITE_CHAR(ch) \
6390 do { \
6391 if (ch <= writer.maxchar) { \
6392 assert(writer.pos < writer.size); \
6393 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6394 } \
6395 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6396 goto onError; \
6397 } \
6398 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006401 if (c != '\\' || s >= end) {
6402 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006404 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006405
Victor Stinner62ec3312016-09-06 17:04:34 -07006406 c = (unsigned char) *s++;
6407 if (c == 'u') {
6408 count = 4;
6409 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006411 else if (c == 'U') {
6412 count = 8;
6413 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006414 }
6415 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006416 assert(writer.pos < writer.size);
6417 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6418 WRITE_CHAR(c);
6419 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006420 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006421 startinpos = s - starts - 2;
6422
6423 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6424 for (ch = 0; count && s < end; ++s, --count) {
6425 c = (unsigned char)*s;
6426 ch <<= 4;
6427 if (c >= '0' && c <= '9') {
6428 ch += c - '0';
6429 }
6430 else if (c >= 'a' && c <= 'f') {
6431 ch += c - ('a' - 10);
6432 }
6433 else if (c >= 'A' && c <= 'F') {
6434 ch += c - ('A' - 10);
6435 }
6436 else {
6437 break;
6438 }
6439 }
6440 if (!count) {
6441 if (ch <= MAX_UNICODE) {
6442 WRITE_CHAR(ch);
6443 continue;
6444 }
6445 message = "\\Uxxxxxxxx out of range";
6446 }
6447
6448 endinpos = s-starts;
6449 writer.min_length = end - s + writer.pos;
6450 if (unicode_decode_call_errorhandler_writer(
6451 errors, &errorHandler,
6452 "rawunicodeescape", message,
6453 &starts, &end, &startinpos, &endinpos, &exc, &s,
6454 &writer)) {
6455 goto onError;
6456 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006457 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006458
6459#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006461 Py_XDECREF(errorHandler);
6462 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006463 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006464
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006466 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006467 Py_XDECREF(errorHandler);
6468 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006470
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471}
6472
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006473
Alexander Belopolsky40018472011-02-26 01:02:56 +00006474PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006475PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476{
Victor Stinner62ec3312016-09-06 17:04:34 -07006477 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006479 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006480 int kind;
6481 void *data;
6482 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006484 if (!PyUnicode_Check(unicode)) {
6485 PyErr_BadArgument();
6486 return NULL;
6487 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006488 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006489 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006491 kind = PyUnicode_KIND(unicode);
6492 data = PyUnicode_DATA(unicode);
6493 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006494 if (kind == PyUnicode_1BYTE_KIND) {
6495 return PyBytes_FromStringAndSize(data, len);
6496 }
Victor Stinner0e368262011-11-10 20:12:49 +01006497
Victor Stinner62ec3312016-09-06 17:04:34 -07006498 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6499 bytes, and 1 byte characters 4. */
6500 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006501
Victor Stinner62ec3312016-09-06 17:04:34 -07006502 if (len > PY_SSIZE_T_MAX / expandsize) {
6503 return PyErr_NoMemory();
6504 }
6505 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6506 if (repr == NULL) {
6507 return NULL;
6508 }
6509 if (len == 0) {
6510 return repr;
6511 }
6512
6513 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006514 for (pos = 0; pos < len; pos++) {
6515 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006516
Victor Stinner62ec3312016-09-06 17:04:34 -07006517 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6518 if (ch < 0x100) {
6519 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006520 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006521 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006522 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 *p++ = '\\';
6524 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006525 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6526 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6527 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6528 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006530 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6531 else {
6532 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6533 *p++ = '\\';
6534 *p++ = 'U';
6535 *p++ = '0';
6536 *p++ = '0';
6537 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6538 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6539 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6540 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6541 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6542 *p++ = Py_hexdigits[ch & 15];
6543 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006545
Victor Stinner62ec3312016-09-06 17:04:34 -07006546 assert(p > PyBytes_AS_STRING(repr));
6547 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6548 return NULL;
6549 }
6550 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551}
6552
Alexander Belopolsky40018472011-02-26 01:02:56 +00006553PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006554PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6555 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006557 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006558 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006559 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006560 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006561 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6562 Py_DECREF(tmp);
6563 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564}
6565
6566/* --- Latin-1 Codec ------------------------------------------------------ */
6567
Alexander Belopolsky40018472011-02-26 01:02:56 +00006568PyObject *
6569PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006570 Py_ssize_t size,
6571 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006574 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575}
6576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006577/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006578static void
6579make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006580 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006581 PyObject *unicode,
6582 Py_ssize_t startpos, Py_ssize_t endpos,
6583 const char *reason)
6584{
6585 if (*exceptionObject == NULL) {
6586 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006587 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006588 encoding, unicode, startpos, endpos, reason);
6589 }
6590 else {
6591 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6592 goto onError;
6593 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6594 goto onError;
6595 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6596 goto onError;
6597 return;
6598 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006599 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006600 }
6601}
6602
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006603/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006604static void
6605raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006606 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006607 PyObject *unicode,
6608 Py_ssize_t startpos, Py_ssize_t endpos,
6609 const char *reason)
6610{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006611 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006612 encoding, unicode, startpos, endpos, reason);
6613 if (*exceptionObject != NULL)
6614 PyCodec_StrictErrors(*exceptionObject);
6615}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006616
6617/* error handling callback helper:
6618 build arguments, call the callback and check the arguments,
6619 put the result into newpos and return the replacement string, which
6620 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006621static PyObject *
6622unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006623 PyObject **errorHandler,
6624 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006625 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006626 Py_ssize_t startpos, Py_ssize_t endpos,
6627 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006628{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006629 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006630 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006631 PyObject *restuple;
6632 PyObject *resunicode;
6633
6634 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006635 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006637 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006638 }
6639
Benjamin Petersonbac79492012-01-14 13:34:47 -05006640 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006641 return NULL;
6642 len = PyUnicode_GET_LENGTH(unicode);
6643
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006644 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006645 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006648
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006649 restuple = PyObject_CallFunctionObjArgs(
6650 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006652 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006654 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 Py_DECREF(restuple);
6656 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006658 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 &resunicode, newpos)) {
6660 Py_DECREF(restuple);
6661 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006662 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006663 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6664 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6665 Py_DECREF(restuple);
6666 return NULL;
6667 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006669 *newpos = len + *newpos;
6670 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006671 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 Py_DECREF(restuple);
6673 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006674 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006675 Py_INCREF(resunicode);
6676 Py_DECREF(restuple);
6677 return resunicode;
6678}
6679
Alexander Belopolsky40018472011-02-26 01:02:56 +00006680static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006681unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006682 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006683 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006685 /* input state */
6686 Py_ssize_t pos=0, size;
6687 int kind;
6688 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689 /* pointer into the output */
6690 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006691 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6692 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006693 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006695 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006696 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006697 /* output object */
6698 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699
Benjamin Petersonbac79492012-01-14 13:34:47 -05006700 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006701 return NULL;
6702 size = PyUnicode_GET_LENGTH(unicode);
6703 kind = PyUnicode_KIND(unicode);
6704 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705 /* allocate enough for a simple encoding without
6706 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006707 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006708 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006709
6710 _PyBytesWriter_Init(&writer);
6711 str = _PyBytesWriter_Alloc(&writer, size);
6712 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006713 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006715 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006716 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006719 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006721 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006722 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006723 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006725 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006727 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006728 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006730
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006731 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006733
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006734 /* Only overallocate the buffer if it's not the last write */
6735 writer.overallocate = (collend < size);
6736
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006738 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006739 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006740
6741 switch (error_handler) {
6742 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006743 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006745
6746 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006747 memset(str, '?', collend - collstart);
6748 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006749 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006750 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006751 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 break;
Victor Stinner50149202015-09-22 00:26:54 +02006753
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006754 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006755 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006756 writer.min_size -= (collend - collstart);
6757 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006758 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006759 if (str == NULL)
6760 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006761 pos = collend;
6762 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006763
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006764 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006765 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006766 writer.min_size -= (collend - collstart);
6767 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006768 unicode, collstart, collend);
6769 if (str == NULL)
6770 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006771 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 break;
Victor Stinner50149202015-09-22 00:26:54 +02006773
Victor Stinnerc3713e92015-09-29 12:32:13 +02006774 case _Py_ERROR_SURROGATEESCAPE:
6775 for (i = collstart; i < collend; ++i) {
6776 ch = PyUnicode_READ(kind, data, i);
6777 if (ch < 0xdc80 || 0xdcff < ch) {
6778 /* Not a UTF-8b surrogate */
6779 break;
6780 }
6781 *str++ = (char)(ch - 0xdc00);
6782 ++pos;
6783 }
6784 if (i >= collend)
6785 break;
6786 collstart = pos;
6787 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006788 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006789
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006791 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6792 encoding, reason, unicode, &exc,
6793 collstart, collend, &newpos);
6794 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006796
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006797 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006798 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006799
Victor Stinner6bd525b2015-10-09 13:10:05 +02006800 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006801 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006802 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006803 PyBytes_AS_STRING(rep),
6804 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006805 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006806 else {
6807 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006808
Victor Stinner6bd525b2015-10-09 13:10:05 +02006809 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006811
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006812 if (limit == 256 ?
6813 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6814 !PyUnicode_IS_ASCII(rep))
6815 {
6816 /* Not all characters are smaller than limit */
6817 raise_encode_exception(&exc, encoding, unicode,
6818 collstart, collend, reason);
6819 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006821 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6822 str = _PyBytesWriter_WriteBytes(&writer, str,
6823 PyUnicode_DATA(rep),
6824 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006826 if (str == NULL)
6827 goto onError;
6828
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006829 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006830 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006831 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006832
6833 /* If overallocation was disabled, ensure that it was the last
6834 write. Otherwise, we missed an optimization */
6835 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006836 }
6837 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006838
Victor Stinner50149202015-09-22 00:26:54 +02006839 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006840 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006841 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006842
6843 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006844 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006845 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006846 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006847 Py_XDECREF(exc);
6848 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006849}
6850
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006851/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006852PyObject *
6853PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006854 Py_ssize_t size,
6855 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006857 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006858 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006859 if (unicode == NULL)
6860 return NULL;
6861 result = unicode_encode_ucs1(unicode, errors, 256);
6862 Py_DECREF(unicode);
6863 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864}
6865
Alexander Belopolsky40018472011-02-26 01:02:56 +00006866PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006867_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868{
6869 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 PyErr_BadArgument();
6871 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006873 if (PyUnicode_READY(unicode) == -1)
6874 return NULL;
6875 /* Fast path: if it is a one-byte string, construct
6876 bytes object directly. */
6877 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6878 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6879 PyUnicode_GET_LENGTH(unicode));
6880 /* Non-Latin-1 characters present. Defer to above function to
6881 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006882 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006883}
6884
6885PyObject*
6886PyUnicode_AsLatin1String(PyObject *unicode)
6887{
6888 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889}
6890
6891/* --- 7-bit ASCII Codec -------------------------------------------------- */
6892
Alexander Belopolsky40018472011-02-26 01:02:56 +00006893PyObject *
6894PyUnicode_DecodeASCII(const char *s,
6895 Py_ssize_t size,
6896 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006899 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006900 int kind;
6901 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006902 Py_ssize_t startinpos;
6903 Py_ssize_t endinpos;
6904 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006905 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006906 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006907 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006908 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006909
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006911 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006912
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006914 if (size == 1 && (unsigned char)s[0] < 128)
6915 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006916
Victor Stinner8f674cc2013-04-17 23:02:17 +02006917 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006918 writer.min_length = size;
6919 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006920 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006921
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006922 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006923 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006924 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006925 writer.pos = outpos;
6926 if (writer.pos == size)
6927 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006928
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006929 s += writer.pos;
6930 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006931 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006932 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006934 PyUnicode_WRITE(kind, data, writer.pos, c);
6935 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006937 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006939
6940 /* byte outsize range 0x00..0x7f: call the error handler */
6941
6942 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006943 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006944
6945 switch (error_handler)
6946 {
6947 case _Py_ERROR_REPLACE:
6948 case _Py_ERROR_SURROGATEESCAPE:
6949 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006950 but we may switch to UCS2 at the first write */
6951 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6952 goto onError;
6953 kind = writer.kind;
6954 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006955
6956 if (error_handler == _Py_ERROR_REPLACE)
6957 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6958 else
6959 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6960 writer.pos++;
6961 ++s;
6962 break;
6963
6964 case _Py_ERROR_IGNORE:
6965 ++s;
6966 break;
6967
6968 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 startinpos = s-starts;
6970 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006971 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006972 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 "ascii", "ordinal not in range(128)",
6974 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006975 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006976 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006977 kind = writer.kind;
6978 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006981 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006982 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006983 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006984
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006986 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006987 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006988 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 return NULL;
6990}
6991
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006992/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006993PyObject *
6994PyUnicode_EncodeASCII(const Py_UNICODE *p,
6995 Py_ssize_t size,
6996 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006998 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006999 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007000 if (unicode == NULL)
7001 return NULL;
7002 result = unicode_encode_ucs1(unicode, errors, 128);
7003 Py_DECREF(unicode);
7004 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005}
7006
Alexander Belopolsky40018472011-02-26 01:02:56 +00007007PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007008_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009{
7010 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 PyErr_BadArgument();
7012 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007014 if (PyUnicode_READY(unicode) == -1)
7015 return NULL;
7016 /* Fast path: if it is an ASCII-only string, construct bytes object
7017 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007018 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007019 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7020 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007021 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007022}
7023
7024PyObject *
7025PyUnicode_AsASCIIString(PyObject *unicode)
7026{
7027 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028}
7029
Steve Dowercc16be82016-09-08 10:35:16 -07007030#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007031
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007032/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007033
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007034#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007035#define NEED_RETRY
7036#endif
7037
Victor Stinner3a50e702011-10-18 21:21:00 +02007038#ifndef WC_ERR_INVALID_CHARS
7039# define WC_ERR_INVALID_CHARS 0x0080
7040#endif
7041
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007042static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007043code_page_name(UINT code_page, PyObject **obj)
7044{
7045 *obj = NULL;
7046 if (code_page == CP_ACP)
7047 return "mbcs";
7048 if (code_page == CP_UTF7)
7049 return "CP_UTF7";
7050 if (code_page == CP_UTF8)
7051 return "CP_UTF8";
7052
7053 *obj = PyBytes_FromFormat("cp%u", code_page);
7054 if (*obj == NULL)
7055 return NULL;
7056 return PyBytes_AS_STRING(*obj);
7057}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058
Victor Stinner3a50e702011-10-18 21:21:00 +02007059static DWORD
7060decode_code_page_flags(UINT code_page)
7061{
7062 if (code_page == CP_UTF7) {
7063 /* The CP_UTF7 decoder only supports flags=0 */
7064 return 0;
7065 }
7066 else
7067 return MB_ERR_INVALID_CHARS;
7068}
7069
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007070/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007071 * Decode a byte string from a Windows code page into unicode object in strict
7072 * mode.
7073 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007074 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7075 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007077static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007078decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007079 wchar_t **buf,
7080 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007081 const char *in,
7082 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007083{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007084 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007085 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007086 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007087
7088 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007089 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007090 while ((outsize = MultiByteToWideChar(code_page, flags,
7091 in, insize, NULL, 0)) <= 0)
7092 {
7093 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7094 goto error;
7095 }
7096 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7097 flags = 0;
7098 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007099
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007100 /* Extend a wchar_t* buffer */
7101 Py_ssize_t n = *bufsize; /* Get the current length */
7102 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7103 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007105 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007106
7107 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007108 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7109 if (outsize <= 0)
7110 goto error;
7111 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007112
Victor Stinner3a50e702011-10-18 21:21:00 +02007113error:
7114 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7115 return -2;
7116 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007117 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118}
7119
Victor Stinner3a50e702011-10-18 21:21:00 +02007120/*
7121 * Decode a byte string from a code page into unicode object with an error
7122 * handler.
7123 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007124 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007125 * UnicodeDecodeError exception and returns -1 on error.
7126 */
7127static int
7128decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007129 wchar_t **buf,
7130 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007131 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007132 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007133{
7134 const char *startin = in;
7135 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007136 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 /* Ideally, we should get reason from FormatMessage. This is the Windows
7138 2000 English version of the message. */
7139 const char *reason = "No mapping for the Unicode character exists "
7140 "in the target code page.";
7141 /* each step cannot decode more than 1 character, but a character can be
7142 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007143 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007144 int insize;
7145 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 PyObject *errorHandler = NULL;
7147 PyObject *exc = NULL;
7148 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007149 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 DWORD err;
7151 int ret = -1;
7152
7153 assert(size > 0);
7154
7155 encoding = code_page_name(code_page, &encoding_obj);
7156 if (encoding == NULL)
7157 return -1;
7158
Victor Stinner7d00cc12014-03-17 23:08:06 +01007159 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007160 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7161 UnicodeDecodeError. */
7162 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7163 if (exc != NULL) {
7164 PyCodec_StrictErrors(exc);
7165 Py_CLEAR(exc);
7166 }
7167 goto error;
7168 }
7169
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007170 /* Extend a wchar_t* buffer */
7171 Py_ssize_t n = *bufsize; /* Get the current length */
7172 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7173 PyErr_NoMemory();
7174 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007175 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007176 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7177 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007179 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007180
7181 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 while (in < endin)
7183 {
7184 /* Decode a character */
7185 insize = 1;
7186 do
7187 {
7188 outsize = MultiByteToWideChar(code_page, flags,
7189 in, insize,
7190 buffer, Py_ARRAY_LENGTH(buffer));
7191 if (outsize > 0)
7192 break;
7193 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007194 if (err == ERROR_INVALID_FLAGS && flags) {
7195 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7196 flags = 0;
7197 continue;
7198 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 if (err != ERROR_NO_UNICODE_TRANSLATION
7200 && err != ERROR_INSUFFICIENT_BUFFER)
7201 {
7202 PyErr_SetFromWindowsErr(0);
7203 goto error;
7204 }
7205 insize++;
7206 }
7207 /* 4=maximum length of a UTF-8 sequence */
7208 while (insize <= 4 && (in + insize) <= endin);
7209
7210 if (outsize <= 0) {
7211 Py_ssize_t startinpos, endinpos, outpos;
7212
Victor Stinner7d00cc12014-03-17 23:08:06 +01007213 /* last character in partial decode? */
7214 if (in + insize >= endin && !final)
7215 break;
7216
Victor Stinner3a50e702011-10-18 21:21:00 +02007217 startinpos = in - startin;
7218 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007219 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007220 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 errors, &errorHandler,
7222 encoding, reason,
7223 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007224 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007225 {
7226 goto error;
7227 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007228 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 }
7230 else {
7231 in += insize;
7232 memcpy(out, buffer, outsize * sizeof(wchar_t));
7233 out += outsize;
7234 }
7235 }
7236
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007237 /* Shrink the buffer */
7238 assert(out - *buf <= *bufsize);
7239 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007240 /* (in - startin) <= size and size is an int */
7241 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007242
7243error:
7244 Py_XDECREF(encoding_obj);
7245 Py_XDECREF(errorHandler);
7246 Py_XDECREF(exc);
7247 return ret;
7248}
7249
Victor Stinner3a50e702011-10-18 21:21:00 +02007250static PyObject *
7251decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007252 const char *s, Py_ssize_t size,
7253 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007254{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007255 wchar_t *buf = NULL;
7256 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007257 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007258
Victor Stinner3a50e702011-10-18 21:21:00 +02007259 if (code_page < 0) {
7260 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7261 return NULL;
7262 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007263 if (size < 0) {
7264 PyErr_BadInternalCall();
7265 return NULL;
7266 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007267
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007268 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007269 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007270
Victor Stinner76a31a62011-11-04 00:05:13 +01007271 do
7272 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007273#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007274 if (size > INT_MAX) {
7275 chunk_size = INT_MAX;
7276 final = 0;
7277 done = 0;
7278 }
7279 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007280#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007281 {
7282 chunk_size = (int)size;
7283 final = (consumed == NULL);
7284 done = 1;
7285 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007286
Victor Stinner76a31a62011-11-04 00:05:13 +01007287 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007288 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007289 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007290 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007291 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007292
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007293 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007294 s, chunk_size);
7295 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007296 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007297 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007298 errors, final);
7299 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007300
7301 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007302 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007303 return NULL;
7304 }
7305
7306 if (consumed)
7307 *consumed += converted;
7308
7309 s += converted;
7310 size -= converted;
7311 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007312
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007313 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7314 PyMem_Free(buf);
7315 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007316}
7317
Alexander Belopolsky40018472011-02-26 01:02:56 +00007318PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007319PyUnicode_DecodeCodePageStateful(int code_page,
7320 const char *s,
7321 Py_ssize_t size,
7322 const char *errors,
7323 Py_ssize_t *consumed)
7324{
7325 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7326}
7327
7328PyObject *
7329PyUnicode_DecodeMBCSStateful(const char *s,
7330 Py_ssize_t size,
7331 const char *errors,
7332 Py_ssize_t *consumed)
7333{
7334 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7335}
7336
7337PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007338PyUnicode_DecodeMBCS(const char *s,
7339 Py_ssize_t size,
7340 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007341{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007342 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7343}
7344
Victor Stinner3a50e702011-10-18 21:21:00 +02007345static DWORD
7346encode_code_page_flags(UINT code_page, const char *errors)
7347{
7348 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007349 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007350 }
7351 else if (code_page == CP_UTF7) {
7352 /* CP_UTF7 only supports flags=0 */
7353 return 0;
7354 }
7355 else {
7356 if (errors != NULL && strcmp(errors, "replace") == 0)
7357 return 0;
7358 else
7359 return WC_NO_BEST_FIT_CHARS;
7360 }
7361}
7362
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007363/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007364 * Encode a Unicode string to a Windows code page into a byte string in strict
7365 * mode.
7366 *
7367 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007368 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007369 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007370static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007371encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007372 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007373 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007374{
Victor Stinner554f3f02010-06-16 23:33:54 +00007375 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 BOOL *pusedDefaultChar = &usedDefaultChar;
7377 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007378 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007379 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 const DWORD flags = encode_code_page_flags(code_page, NULL);
7381 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007382 /* Create a substring so that we can get the UTF-16 representation
7383 of just the slice under consideration. */
7384 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007385
Martin v. Löwis3d325192011-11-04 18:23:06 +01007386 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007387
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007389 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007391 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007392
Victor Stinner2fc507f2011-11-04 20:06:39 +01007393 substring = PyUnicode_Substring(unicode, offset, offset+len);
7394 if (substring == NULL)
7395 return -1;
7396 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7397 if (p == NULL) {
7398 Py_DECREF(substring);
7399 return -1;
7400 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007401 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007402
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007403 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007404 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007405 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 NULL, 0,
7407 NULL, pusedDefaultChar);
7408 if (outsize <= 0)
7409 goto error;
7410 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007411 if (pusedDefaultChar && *pusedDefaultChar) {
7412 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007414 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007415
Victor Stinner3a50e702011-10-18 21:21:00 +02007416 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007419 if (*outbytes == NULL) {
7420 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007422 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007423 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007424 }
7425 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 const Py_ssize_t n = PyBytes_Size(*outbytes);
7428 if (outsize > PY_SSIZE_T_MAX - n) {
7429 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007433 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7434 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007436 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007438 }
7439
7440 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007441 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007442 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 out, outsize,
7444 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007445 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007446 if (outsize <= 0)
7447 goto error;
7448 if (pusedDefaultChar && *pusedDefaultChar)
7449 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007450 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007451
Victor Stinner3a50e702011-10-18 21:21:00 +02007452error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007453 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7455 return -2;
7456 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007457 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007458}
7459
Victor Stinner3a50e702011-10-18 21:21:00 +02007460/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007461 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 * error handler.
7463 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007464 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 * -1 on other error.
7466 */
7467static int
7468encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007469 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007470 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007471{
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007473 Py_ssize_t pos = unicode_offset;
7474 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 /* Ideally, we should get reason from FormatMessage. This is the Windows
7476 2000 English version of the message. */
7477 const char *reason = "invalid character";
7478 /* 4=maximum length of a UTF-8 sequence */
7479 char buffer[4];
7480 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7481 Py_ssize_t outsize;
7482 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 PyObject *errorHandler = NULL;
7484 PyObject *exc = NULL;
7485 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007486 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007487 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 PyObject *rep;
7489 int ret = -1;
7490
7491 assert(insize > 0);
7492
7493 encoding = code_page_name(code_page, &encoding_obj);
7494 if (encoding == NULL)
7495 return -1;
7496
7497 if (errors == NULL || strcmp(errors, "strict") == 0) {
7498 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7499 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007500 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 if (exc != NULL) {
7502 PyCodec_StrictErrors(exc);
7503 Py_DECREF(exc);
7504 }
7505 Py_XDECREF(encoding_obj);
7506 return -1;
7507 }
7508
7509 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7510 pusedDefaultChar = &usedDefaultChar;
7511 else
7512 pusedDefaultChar = NULL;
7513
7514 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7515 PyErr_NoMemory();
7516 goto error;
7517 }
7518 outsize = insize * Py_ARRAY_LENGTH(buffer);
7519
7520 if (*outbytes == NULL) {
7521 /* Create string object */
7522 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7523 if (*outbytes == NULL)
7524 goto error;
7525 out = PyBytes_AS_STRING(*outbytes);
7526 }
7527 else {
7528 /* Extend string object */
7529 Py_ssize_t n = PyBytes_Size(*outbytes);
7530 if (n > PY_SSIZE_T_MAX - outsize) {
7531 PyErr_NoMemory();
7532 goto error;
7533 }
7534 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7535 goto error;
7536 out = PyBytes_AS_STRING(*outbytes) + n;
7537 }
7538
7539 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007540 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007542 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7543 wchar_t chars[2];
7544 int charsize;
7545 if (ch < 0x10000) {
7546 chars[0] = (wchar_t)ch;
7547 charsize = 1;
7548 }
7549 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007550 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7551 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007552 charsize = 2;
7553 }
7554
Victor Stinner3a50e702011-10-18 21:21:00 +02007555 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007556 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007557 buffer, Py_ARRAY_LENGTH(buffer),
7558 NULL, pusedDefaultChar);
7559 if (outsize > 0) {
7560 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7561 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007562 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 memcpy(out, buffer, outsize);
7564 out += outsize;
7565 continue;
7566 }
7567 }
7568 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7569 PyErr_SetFromWindowsErr(0);
7570 goto error;
7571 }
7572
Victor Stinner3a50e702011-10-18 21:21:00 +02007573 rep = unicode_encode_call_errorhandler(
7574 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007575 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007576 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007577 if (rep == NULL)
7578 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007579 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007580
7581 if (PyBytes_Check(rep)) {
7582 outsize = PyBytes_GET_SIZE(rep);
7583 if (outsize != 1) {
7584 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7585 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7586 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7587 Py_DECREF(rep);
7588 goto error;
7589 }
7590 out = PyBytes_AS_STRING(*outbytes) + offset;
7591 }
7592 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7593 out += outsize;
7594 }
7595 else {
7596 Py_ssize_t i;
7597 enum PyUnicode_Kind kind;
7598 void *data;
7599
Benjamin Petersonbac79492012-01-14 13:34:47 -05007600 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007601 Py_DECREF(rep);
7602 goto error;
7603 }
7604
7605 outsize = PyUnicode_GET_LENGTH(rep);
7606 if (outsize != 1) {
7607 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7608 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7609 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7610 Py_DECREF(rep);
7611 goto error;
7612 }
7613 out = PyBytes_AS_STRING(*outbytes) + offset;
7614 }
7615 kind = PyUnicode_KIND(rep);
7616 data = PyUnicode_DATA(rep);
7617 for (i=0; i < outsize; i++) {
7618 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7619 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007620 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007621 encoding, unicode,
7622 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007623 "unable to encode error handler result to ASCII");
7624 Py_DECREF(rep);
7625 goto error;
7626 }
7627 *out = (unsigned char)ch;
7628 out++;
7629 }
7630 }
7631 Py_DECREF(rep);
7632 }
7633 /* write a NUL byte */
7634 *out = 0;
7635 outsize = out - PyBytes_AS_STRING(*outbytes);
7636 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7637 if (_PyBytes_Resize(outbytes, outsize) < 0)
7638 goto error;
7639 ret = 0;
7640
7641error:
7642 Py_XDECREF(encoding_obj);
7643 Py_XDECREF(errorHandler);
7644 Py_XDECREF(exc);
7645 return ret;
7646}
7647
Victor Stinner3a50e702011-10-18 21:21:00 +02007648static PyObject *
7649encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007650 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007651 const char *errors)
7652{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007653 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007654 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007655 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007656 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007657
Victor Stinner29dacf22015-01-26 16:41:32 +01007658 if (!PyUnicode_Check(unicode)) {
7659 PyErr_BadArgument();
7660 return NULL;
7661 }
7662
Benjamin Petersonbac79492012-01-14 13:34:47 -05007663 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007664 return NULL;
7665 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007666
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 if (code_page < 0) {
7668 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7669 return NULL;
7670 }
7671
Martin v. Löwis3d325192011-11-04 18:23:06 +01007672 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007673 return PyBytes_FromStringAndSize(NULL, 0);
7674
Victor Stinner7581cef2011-11-03 22:32:33 +01007675 offset = 0;
7676 do
7677 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007678#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007679 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007680 chunks. */
7681 if (len > INT_MAX/2) {
7682 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007683 done = 0;
7684 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007685 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007686#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007687 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007688 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007689 done = 1;
7690 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007691
Victor Stinner76a31a62011-11-04 00:05:13 +01007692 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007693 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007694 errors);
7695 if (ret == -2)
7696 ret = encode_code_page_errors(code_page, &outbytes,
7697 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007698 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007699 if (ret < 0) {
7700 Py_XDECREF(outbytes);
7701 return NULL;
7702 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007703
Victor Stinner7581cef2011-11-03 22:32:33 +01007704 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007705 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007706 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007707
Victor Stinner3a50e702011-10-18 21:21:00 +02007708 return outbytes;
7709}
7710
7711PyObject *
7712PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7713 Py_ssize_t size,
7714 const char *errors)
7715{
Victor Stinner7581cef2011-11-03 22:32:33 +01007716 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007717 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007718 if (unicode == NULL)
7719 return NULL;
7720 res = encode_code_page(CP_ACP, unicode, errors);
7721 Py_DECREF(unicode);
7722 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007723}
7724
7725PyObject *
7726PyUnicode_EncodeCodePage(int code_page,
7727 PyObject *unicode,
7728 const char *errors)
7729{
Victor Stinner7581cef2011-11-03 22:32:33 +01007730 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007731}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007732
Alexander Belopolsky40018472011-02-26 01:02:56 +00007733PyObject *
7734PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007735{
Victor Stinner7581cef2011-11-03 22:32:33 +01007736 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007737}
7738
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007739#undef NEED_RETRY
7740
Steve Dowercc16be82016-09-08 10:35:16 -07007741#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007742
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743/* --- Character Mapping Codec -------------------------------------------- */
7744
Victor Stinnerfb161b12013-04-18 01:44:27 +02007745static int
7746charmap_decode_string(const char *s,
7747 Py_ssize_t size,
7748 PyObject *mapping,
7749 const char *errors,
7750 _PyUnicodeWriter *writer)
7751{
7752 const char *starts = s;
7753 const char *e;
7754 Py_ssize_t startinpos, endinpos;
7755 PyObject *errorHandler = NULL, *exc = NULL;
7756 Py_ssize_t maplen;
7757 enum PyUnicode_Kind mapkind;
7758 void *mapdata;
7759 Py_UCS4 x;
7760 unsigned char ch;
7761
7762 if (PyUnicode_READY(mapping) == -1)
7763 return -1;
7764
7765 maplen = PyUnicode_GET_LENGTH(mapping);
7766 mapdata = PyUnicode_DATA(mapping);
7767 mapkind = PyUnicode_KIND(mapping);
7768
7769 e = s + size;
7770
7771 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7772 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7773 * is disabled in encoding aliases, latin1 is preferred because
7774 * its implementation is faster. */
7775 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7776 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7777 Py_UCS4 maxchar = writer->maxchar;
7778
7779 assert (writer->kind == PyUnicode_1BYTE_KIND);
7780 while (s < e) {
7781 ch = *s;
7782 x = mapdata_ucs1[ch];
7783 if (x > maxchar) {
7784 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7785 goto onError;
7786 maxchar = writer->maxchar;
7787 outdata = (Py_UCS1 *)writer->data;
7788 }
7789 outdata[writer->pos] = x;
7790 writer->pos++;
7791 ++s;
7792 }
7793 return 0;
7794 }
7795
7796 while (s < e) {
7797 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7798 enum PyUnicode_Kind outkind = writer->kind;
7799 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7800 if (outkind == PyUnicode_1BYTE_KIND) {
7801 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7802 Py_UCS4 maxchar = writer->maxchar;
7803 while (s < e) {
7804 ch = *s;
7805 x = mapdata_ucs2[ch];
7806 if (x > maxchar)
7807 goto Error;
7808 outdata[writer->pos] = x;
7809 writer->pos++;
7810 ++s;
7811 }
7812 break;
7813 }
7814 else if (outkind == PyUnicode_2BYTE_KIND) {
7815 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7816 while (s < e) {
7817 ch = *s;
7818 x = mapdata_ucs2[ch];
7819 if (x == 0xFFFE)
7820 goto Error;
7821 outdata[writer->pos] = x;
7822 writer->pos++;
7823 ++s;
7824 }
7825 break;
7826 }
7827 }
7828 ch = *s;
7829
7830 if (ch < maplen)
7831 x = PyUnicode_READ(mapkind, mapdata, ch);
7832 else
7833 x = 0xfffe; /* invalid value */
7834Error:
7835 if (x == 0xfffe)
7836 {
7837 /* undefined mapping */
7838 startinpos = s-starts;
7839 endinpos = startinpos+1;
7840 if (unicode_decode_call_errorhandler_writer(
7841 errors, &errorHandler,
7842 "charmap", "character maps to <undefined>",
7843 &starts, &e, &startinpos, &endinpos, &exc, &s,
7844 writer)) {
7845 goto onError;
7846 }
7847 continue;
7848 }
7849
7850 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7851 goto onError;
7852 ++s;
7853 }
7854 Py_XDECREF(errorHandler);
7855 Py_XDECREF(exc);
7856 return 0;
7857
7858onError:
7859 Py_XDECREF(errorHandler);
7860 Py_XDECREF(exc);
7861 return -1;
7862}
7863
7864static int
7865charmap_decode_mapping(const char *s,
7866 Py_ssize_t size,
7867 PyObject *mapping,
7868 const char *errors,
7869 _PyUnicodeWriter *writer)
7870{
7871 const char *starts = s;
7872 const char *e;
7873 Py_ssize_t startinpos, endinpos;
7874 PyObject *errorHandler = NULL, *exc = NULL;
7875 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007876 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007877
7878 e = s + size;
7879
7880 while (s < e) {
7881 ch = *s;
7882
7883 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7884 key = PyLong_FromLong((long)ch);
7885 if (key == NULL)
7886 goto onError;
7887
7888 item = PyObject_GetItem(mapping, key);
7889 Py_DECREF(key);
7890 if (item == NULL) {
7891 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7892 /* No mapping found means: mapping is undefined. */
7893 PyErr_Clear();
7894 goto Undefined;
7895 } else
7896 goto onError;
7897 }
7898
7899 /* Apply mapping */
7900 if (item == Py_None)
7901 goto Undefined;
7902 if (PyLong_Check(item)) {
7903 long value = PyLong_AS_LONG(item);
7904 if (value == 0xFFFE)
7905 goto Undefined;
7906 if (value < 0 || value > MAX_UNICODE) {
7907 PyErr_Format(PyExc_TypeError,
7908 "character mapping must be in range(0x%lx)",
7909 (unsigned long)MAX_UNICODE + 1);
7910 goto onError;
7911 }
7912
7913 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7914 goto onError;
7915 }
7916 else if (PyUnicode_Check(item)) {
7917 if (PyUnicode_READY(item) == -1)
7918 goto onError;
7919 if (PyUnicode_GET_LENGTH(item) == 1) {
7920 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7921 if (value == 0xFFFE)
7922 goto Undefined;
7923 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7924 goto onError;
7925 }
7926 else {
7927 writer->overallocate = 1;
7928 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7929 goto onError;
7930 }
7931 }
7932 else {
7933 /* wrong return value */
7934 PyErr_SetString(PyExc_TypeError,
7935 "character mapping must return integer, None or str");
7936 goto onError;
7937 }
7938 Py_CLEAR(item);
7939 ++s;
7940 continue;
7941
7942Undefined:
7943 /* undefined mapping */
7944 Py_CLEAR(item);
7945 startinpos = s-starts;
7946 endinpos = startinpos+1;
7947 if (unicode_decode_call_errorhandler_writer(
7948 errors, &errorHandler,
7949 "charmap", "character maps to <undefined>",
7950 &starts, &e, &startinpos, &endinpos, &exc, &s,
7951 writer)) {
7952 goto onError;
7953 }
7954 }
7955 Py_XDECREF(errorHandler);
7956 Py_XDECREF(exc);
7957 return 0;
7958
7959onError:
7960 Py_XDECREF(item);
7961 Py_XDECREF(errorHandler);
7962 Py_XDECREF(exc);
7963 return -1;
7964}
7965
Alexander Belopolsky40018472011-02-26 01:02:56 +00007966PyObject *
7967PyUnicode_DecodeCharmap(const char *s,
7968 Py_ssize_t size,
7969 PyObject *mapping,
7970 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007972 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007973
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 /* Default to Latin-1 */
7975 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007979 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007980 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007981 writer.min_length = size;
7982 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007984
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007985 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007986 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7987 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007988 }
7989 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007990 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7991 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007993 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007994
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007996 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 return NULL;
7998}
7999
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008000/* Charmap encoding: the lookup table */
8001
Alexander Belopolsky40018472011-02-26 01:02:56 +00008002struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 PyObject_HEAD
8004 unsigned char level1[32];
8005 int count2, count3;
8006 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008007};
8008
8009static PyObject*
8010encoding_map_size(PyObject *obj, PyObject* args)
8011{
8012 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008013 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008015}
8016
8017static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008018 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 PyDoc_STR("Return the size (in bytes) of this object") },
8020 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008021};
8022
8023static void
8024encoding_map_dealloc(PyObject* o)
8025{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008026 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008027}
8028
8029static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008030 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 "EncodingMap", /*tp_name*/
8032 sizeof(struct encoding_map), /*tp_basicsize*/
8033 0, /*tp_itemsize*/
8034 /* methods */
8035 encoding_map_dealloc, /*tp_dealloc*/
8036 0, /*tp_print*/
8037 0, /*tp_getattr*/
8038 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008039 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 0, /*tp_repr*/
8041 0, /*tp_as_number*/
8042 0, /*tp_as_sequence*/
8043 0, /*tp_as_mapping*/
8044 0, /*tp_hash*/
8045 0, /*tp_call*/
8046 0, /*tp_str*/
8047 0, /*tp_getattro*/
8048 0, /*tp_setattro*/
8049 0, /*tp_as_buffer*/
8050 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8051 0, /*tp_doc*/
8052 0, /*tp_traverse*/
8053 0, /*tp_clear*/
8054 0, /*tp_richcompare*/
8055 0, /*tp_weaklistoffset*/
8056 0, /*tp_iter*/
8057 0, /*tp_iternext*/
8058 encoding_map_methods, /*tp_methods*/
8059 0, /*tp_members*/
8060 0, /*tp_getset*/
8061 0, /*tp_base*/
8062 0, /*tp_dict*/
8063 0, /*tp_descr_get*/
8064 0, /*tp_descr_set*/
8065 0, /*tp_dictoffset*/
8066 0, /*tp_init*/
8067 0, /*tp_alloc*/
8068 0, /*tp_new*/
8069 0, /*tp_free*/
8070 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008071};
8072
8073PyObject*
8074PyUnicode_BuildEncodingMap(PyObject* string)
8075{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008076 PyObject *result;
8077 struct encoding_map *mresult;
8078 int i;
8079 int need_dict = 0;
8080 unsigned char level1[32];
8081 unsigned char level2[512];
8082 unsigned char *mlevel1, *mlevel2, *mlevel3;
8083 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008084 int kind;
8085 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008086 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008087 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008088
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008089 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090 PyErr_BadArgument();
8091 return NULL;
8092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008093 kind = PyUnicode_KIND(string);
8094 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008095 length = PyUnicode_GET_LENGTH(string);
8096 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008097 memset(level1, 0xFF, sizeof level1);
8098 memset(level2, 0xFF, sizeof level2);
8099
8100 /* If there isn't a one-to-one mapping of NULL to \0,
8101 or if there are non-BMP characters, we need to use
8102 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008103 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008104 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008105 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008106 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008107 ch = PyUnicode_READ(kind, data, i);
8108 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008109 need_dict = 1;
8110 break;
8111 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008113 /* unmapped character */
8114 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008115 l1 = ch >> 11;
8116 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008117 if (level1[l1] == 0xFF)
8118 level1[l1] = count2++;
8119 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008120 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008121 }
8122
8123 if (count2 >= 0xFF || count3 >= 0xFF)
8124 need_dict = 1;
8125
8126 if (need_dict) {
8127 PyObject *result = PyDict_New();
8128 PyObject *key, *value;
8129 if (!result)
8130 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008131 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008133 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008134 if (!key || !value)
8135 goto failed1;
8136 if (PyDict_SetItem(result, key, value) == -1)
8137 goto failed1;
8138 Py_DECREF(key);
8139 Py_DECREF(value);
8140 }
8141 return result;
8142 failed1:
8143 Py_XDECREF(key);
8144 Py_XDECREF(value);
8145 Py_DECREF(result);
8146 return NULL;
8147 }
8148
8149 /* Create a three-level trie */
8150 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8151 16*count2 + 128*count3 - 1);
8152 if (!result)
8153 return PyErr_NoMemory();
8154 PyObject_Init(result, &EncodingMapType);
8155 mresult = (struct encoding_map*)result;
8156 mresult->count2 = count2;
8157 mresult->count3 = count3;
8158 mlevel1 = mresult->level1;
8159 mlevel2 = mresult->level23;
8160 mlevel3 = mresult->level23 + 16*count2;
8161 memcpy(mlevel1, level1, 32);
8162 memset(mlevel2, 0xFF, 16*count2);
8163 memset(mlevel3, 0, 128*count3);
8164 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008165 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008166 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008167 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8168 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008169 /* unmapped character */
8170 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008171 o1 = ch>>11;
8172 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008173 i2 = 16*mlevel1[o1] + o2;
8174 if (mlevel2[i2] == 0xFF)
8175 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008176 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 i3 = 128*mlevel2[i2] + o3;
8178 mlevel3[i3] = i;
8179 }
8180 return result;
8181}
8182
8183static int
Victor Stinner22168992011-11-20 17:09:18 +01008184encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008185{
8186 struct encoding_map *map = (struct encoding_map*)mapping;
8187 int l1 = c>>11;
8188 int l2 = (c>>7) & 0xF;
8189 int l3 = c & 0x7F;
8190 int i;
8191
Victor Stinner22168992011-11-20 17:09:18 +01008192 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008194 if (c == 0)
8195 return 0;
8196 /* level 1*/
8197 i = map->level1[l1];
8198 if (i == 0xFF) {
8199 return -1;
8200 }
8201 /* level 2*/
8202 i = map->level23[16*i+l2];
8203 if (i == 0xFF) {
8204 return -1;
8205 }
8206 /* level 3 */
8207 i = map->level23[16*map->count2 + 128*i + l3];
8208 if (i == 0) {
8209 return -1;
8210 }
8211 return i;
8212}
8213
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008214/* Lookup the character ch in the mapping. If the character
8215 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008216 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008217static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008218charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219{
Christian Heimes217cfd12007-12-02 14:31:20 +00008220 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008221 PyObject *x;
8222
8223 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225 x = PyObject_GetItem(mapping, w);
8226 Py_DECREF(w);
8227 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8229 /* No mapping found means: mapping is undefined. */
8230 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008231 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 } else
8233 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008235 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008237 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 long value = PyLong_AS_LONG(x);
8239 if (value < 0 || value > 255) {
8240 PyErr_SetString(PyExc_TypeError,
8241 "character mapping must be in range(256)");
8242 Py_DECREF(x);
8243 return NULL;
8244 }
8245 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008247 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 /* wrong return value */
8251 PyErr_Format(PyExc_TypeError,
8252 "character mapping must return integer, bytes or None, not %.400s",
8253 x->ob_type->tp_name);
8254 Py_DECREF(x);
8255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256 }
8257}
8258
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008259static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008260charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008261{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008262 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8263 /* exponentially overallocate to minimize reallocations */
8264 if (requiredsize < 2*outsize)
8265 requiredsize = 2*outsize;
8266 if (_PyBytes_Resize(outobj, requiredsize))
8267 return -1;
8268 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008269}
8270
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008275 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 space is available. Return a new reference to the object that
8277 was put in the output buffer, or Py_None, if the mapping was undefined
8278 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008279 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008280static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008281charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008282 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008283{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008284 PyObject *rep;
8285 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008286 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008287
Christian Heimes90aa7642007-12-19 02:45:37 +00008288 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008289 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008291 if (res == -1)
8292 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 if (outsize<requiredsize)
8294 if (charmapencode_resize(outobj, outpos, requiredsize))
8295 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008296 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 outstart[(*outpos)++] = (char)res;
8298 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008299 }
8300
8301 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008304 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 Py_DECREF(rep);
8306 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 if (PyLong_Check(rep)) {
8309 Py_ssize_t requiredsize = *outpos+1;
8310 if (outsize<requiredsize)
8311 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8312 Py_DECREF(rep);
8313 return enc_EXCEPTION;
8314 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008315 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008317 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 else {
8319 const char *repchars = PyBytes_AS_STRING(rep);
8320 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8321 Py_ssize_t requiredsize = *outpos+repsize;
8322 if (outsize<requiredsize)
8323 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8324 Py_DECREF(rep);
8325 return enc_EXCEPTION;
8326 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008327 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 memcpy(outstart + *outpos, repchars, repsize);
8329 *outpos += repsize;
8330 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 Py_DECREF(rep);
8333 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334}
8335
8336/* handle an error in PyUnicode_EncodeCharmap
8337 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008338static int
8339charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008340 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008342 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008343 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344{
8345 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008346 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008347 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008348 enum PyUnicode_Kind kind;
8349 void *data;
8350 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008351 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008352 Py_ssize_t collstartpos = *inpos;
8353 Py_ssize_t collendpos = *inpos+1;
8354 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008355 const char *encoding = "charmap";
8356 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008357 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008358 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008359 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360
Benjamin Petersonbac79492012-01-14 13:34:47 -05008361 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008362 return -1;
8363 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008364 /* find all unencodable characters */
8365 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008366 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008367 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008368 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008369 val = encoding_map_lookup(ch, mapping);
8370 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 break;
8372 ++collendpos;
8373 continue;
8374 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008375
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008376 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8377 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 if (rep==NULL)
8379 return -1;
8380 else if (rep!=Py_None) {
8381 Py_DECREF(rep);
8382 break;
8383 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008384 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 }
8387 /* cache callback name lookup
8388 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008389 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008390 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008391
8392 switch (*error_handler) {
8393 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008394 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008395 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008396
8397 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008398 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 x = charmapencode_output('?', mapping, res, respos);
8400 if (x==enc_EXCEPTION) {
8401 return -1;
8402 }
8403 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008404 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 return -1;
8406 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008407 }
8408 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008409 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008410 *inpos = collendpos;
8411 break;
Victor Stinner50149202015-09-22 00:26:54 +02008412
8413 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008414 /* generate replacement (temporarily (mis)uses p) */
8415 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 char buffer[2+29+1+1];
8417 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008418 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 for (cp = buffer; *cp; ++cp) {
8420 x = charmapencode_output(*cp, mapping, res, respos);
8421 if (x==enc_EXCEPTION)
8422 return -1;
8423 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008424 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 return -1;
8426 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008427 }
8428 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008429 *inpos = collendpos;
8430 break;
Victor Stinner50149202015-09-22 00:26:54 +02008431
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 default:
Victor Stinner50149202015-09-22 00:26:54 +02008433 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008434 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008436 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008438 if (PyBytes_Check(repunicode)) {
8439 /* Directly copy bytes result to output. */
8440 Py_ssize_t outsize = PyBytes_Size(*res);
8441 Py_ssize_t requiredsize;
8442 repsize = PyBytes_Size(repunicode);
8443 requiredsize = *respos + repsize;
8444 if (requiredsize > outsize)
8445 /* Make room for all additional bytes. */
8446 if (charmapencode_resize(res, respos, requiredsize)) {
8447 Py_DECREF(repunicode);
8448 return -1;
8449 }
8450 memcpy(PyBytes_AsString(*res) + *respos,
8451 PyBytes_AsString(repunicode), repsize);
8452 *respos += repsize;
8453 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008454 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008455 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008456 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008457 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008458 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008459 Py_DECREF(repunicode);
8460 return -1;
8461 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008462 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008463 data = PyUnicode_DATA(repunicode);
8464 kind = PyUnicode_KIND(repunicode);
8465 for (index = 0; index < repsize; index++) {
8466 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8467 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008469 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 return -1;
8471 }
8472 else if (x==enc_FAILED) {
8473 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008474 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 return -1;
8476 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008477 }
8478 *inpos = newpos;
8479 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480 }
8481 return 0;
8482}
8483
Alexander Belopolsky40018472011-02-26 01:02:56 +00008484PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008485_PyUnicode_EncodeCharmap(PyObject *unicode,
8486 PyObject *mapping,
8487 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 /* output object */
8490 PyObject *res = NULL;
8491 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008492 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008493 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008494 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008495 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008496 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008498 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008499 void *data;
8500 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501
Benjamin Petersonbac79492012-01-14 13:34:47 -05008502 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008503 return NULL;
8504 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008505 data = PyUnicode_DATA(unicode);
8506 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008507
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508 /* Default to Latin-1 */
8509 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008510 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008512 /* allocate enough for a simple encoding without
8513 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008514 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 if (res == NULL)
8516 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008517 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008521 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008523 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 if (x==enc_EXCEPTION) /* error */
8525 goto onError;
8526 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008527 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008529 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 &res, &respos)) {
8531 goto onError;
8532 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008533 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 else
8535 /* done with this character => adjust input position */
8536 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008539 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008540 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008541 if (_PyBytes_Resize(&res, respos) < 0)
8542 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008543
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008545 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 return res;
8547
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549 Py_XDECREF(res);
8550 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008551 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552 return NULL;
8553}
8554
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008555/* Deprecated */
8556PyObject *
8557PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8558 Py_ssize_t size,
8559 PyObject *mapping,
8560 const char *errors)
8561{
8562 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008563 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008564 if (unicode == NULL)
8565 return NULL;
8566 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8567 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008568 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008569}
8570
Alexander Belopolsky40018472011-02-26 01:02:56 +00008571PyObject *
8572PyUnicode_AsCharmapString(PyObject *unicode,
8573 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574{
8575 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 PyErr_BadArgument();
8577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008579 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580}
8581
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008583static void
8584make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008585 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008586 Py_ssize_t startpos, Py_ssize_t endpos,
8587 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590 *exceptionObject = _PyUnicodeTranslateError_Create(
8591 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 }
8593 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8595 goto onError;
8596 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8597 goto onError;
8598 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8599 goto onError;
8600 return;
8601 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008602 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603 }
8604}
8605
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008606/* error handling callback helper:
8607 build arguments, call the callback and check the arguments,
8608 put the result into newpos and return the replacement string, which
8609 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008610static PyObject *
8611unicode_translate_call_errorhandler(const char *errors,
8612 PyObject **errorHandler,
8613 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008615 Py_ssize_t startpos, Py_ssize_t endpos,
8616 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008617{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008618 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008619
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008620 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008621 PyObject *restuple;
8622 PyObject *resunicode;
8623
8624 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628 }
8629
8630 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008635 restuple = PyObject_CallFunctionObjArgs(
8636 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008640 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 Py_DECREF(restuple);
8642 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008644 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 &resunicode, &i_newpos)) {
8646 Py_DECREF(restuple);
8647 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008648 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008649 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008651 else
8652 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008654 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 Py_DECREF(restuple);
8656 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008657 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008658 Py_INCREF(resunicode);
8659 Py_DECREF(restuple);
8660 return resunicode;
8661}
8662
8663/* Lookup the character ch in the mapping and put the result in result,
8664 which must be decrefed by the caller.
8665 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008666static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668{
Christian Heimes217cfd12007-12-02 14:31:20 +00008669 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 PyObject *x;
8671
8672 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 x = PyObject_GetItem(mapping, w);
8675 Py_DECREF(w);
8676 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8678 /* No mapping found means: use 1:1 mapping. */
8679 PyErr_Clear();
8680 *result = NULL;
8681 return 0;
8682 } else
8683 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684 }
8685 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 *result = x;
8687 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008689 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008691 if (value < 0 || value > MAX_UNICODE) {
8692 PyErr_Format(PyExc_ValueError,
8693 "character mapping must be in range(0x%x)",
8694 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 Py_DECREF(x);
8696 return -1;
8697 }
8698 *result = x;
8699 return 0;
8700 }
8701 else if (PyUnicode_Check(x)) {
8702 *result = x;
8703 return 0;
8704 }
8705 else {
8706 /* wrong return value */
8707 PyErr_SetString(PyExc_TypeError,
8708 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008709 Py_DECREF(x);
8710 return -1;
8711 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712}
Victor Stinner1194ea02014-04-04 19:37:40 +02008713
8714/* lookup the character, write the result into the writer.
8715 Return 1 if the result was written into the writer, return 0 if the mapping
8716 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008717static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008718charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8719 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008720{
Victor Stinner1194ea02014-04-04 19:37:40 +02008721 PyObject *item;
8722
8723 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008725
8726 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008728 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008731 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008733
8734 if (item == Py_None) {
8735 Py_DECREF(item);
8736 return 0;
8737 }
8738
8739 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008740 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8741 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8742 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008743 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8744 Py_DECREF(item);
8745 return -1;
8746 }
8747 Py_DECREF(item);
8748 return 1;
8749 }
8750
8751 if (!PyUnicode_Check(item)) {
8752 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008754 }
8755
8756 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8757 Py_DECREF(item);
8758 return -1;
8759 }
8760
8761 Py_DECREF(item);
8762 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763}
8764
Victor Stinner89a76ab2014-04-05 11:44:04 +02008765static int
8766unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8767 Py_UCS1 *translate)
8768{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008769 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008770 int ret = 0;
8771
Victor Stinner89a76ab2014-04-05 11:44:04 +02008772 if (charmaptranslate_lookup(ch, mapping, &item)) {
8773 return -1;
8774 }
8775
8776 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008777 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008778 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008779 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008780 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008781 /* not found => default to 1:1 mapping */
8782 translate[ch] = ch;
8783 return 1;
8784 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008785 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008786 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008787 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8788 used it */
8789 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008790 /* invalid character or character outside ASCII:
8791 skip the fast translate */
8792 goto exit;
8793 }
8794 translate[ch] = (Py_UCS1)replace;
8795 }
8796 else if (PyUnicode_Check(item)) {
8797 Py_UCS4 replace;
8798
8799 if (PyUnicode_READY(item) == -1) {
8800 Py_DECREF(item);
8801 return -1;
8802 }
8803 if (PyUnicode_GET_LENGTH(item) != 1)
8804 goto exit;
8805
8806 replace = PyUnicode_READ_CHAR(item, 0);
8807 if (replace > 127)
8808 goto exit;
8809 translate[ch] = (Py_UCS1)replace;
8810 }
8811 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008812 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008813 goto exit;
8814 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008815 ret = 1;
8816
Benjamin Peterson1365de72014-04-07 20:15:41 -04008817 exit:
8818 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008819 return ret;
8820}
8821
8822/* Fast path for ascii => ascii translation. Return 1 if the whole string
8823 was translated into writer, return 0 if the input string was partially
8824 translated into writer, raise an exception and return -1 on error. */
8825static int
8826unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008827 _PyUnicodeWriter *writer, int ignore,
8828 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008829{
Victor Stinner872b2912014-04-05 14:27:07 +02008830 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008831 Py_ssize_t len;
8832 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008833 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008834
Victor Stinner89a76ab2014-04-05 11:44:04 +02008835 len = PyUnicode_GET_LENGTH(input);
8836
Victor Stinner872b2912014-04-05 14:27:07 +02008837 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008838
8839 in = PyUnicode_1BYTE_DATA(input);
8840 end = in + len;
8841
8842 assert(PyUnicode_IS_ASCII(writer->buffer));
8843 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8844 out = PyUnicode_1BYTE_DATA(writer->buffer);
8845
Victor Stinner872b2912014-04-05 14:27:07 +02008846 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008847 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008848 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008849 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008850 int translate = unicode_fast_translate_lookup(mapping, ch,
8851 ascii_table);
8852 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008853 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008854 if (translate == 0)
8855 goto exit;
8856 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008857 }
Victor Stinner872b2912014-04-05 14:27:07 +02008858 if (ch2 == 0xfe) {
8859 if (ignore)
8860 continue;
8861 goto exit;
8862 }
8863 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008864 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008865 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008866 }
Victor Stinner872b2912014-04-05 14:27:07 +02008867 res = 1;
8868
8869exit:
8870 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008871 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008872 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008873}
8874
Victor Stinner3222da22015-10-01 22:07:32 +02008875static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876_PyUnicode_TranslateCharmap(PyObject *input,
8877 PyObject *mapping,
8878 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008880 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008881 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 Py_ssize_t size, i;
8883 int kind;
8884 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008885 _PyUnicodeWriter writer;
8886 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008887 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008888 PyObject *errorHandler = NULL;
8889 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008890 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008891 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008892
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 PyErr_BadArgument();
8895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 if (PyUnicode_READY(input) == -1)
8899 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008900 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 kind = PyUnicode_KIND(input);
8902 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008904 if (size == 0)
8905 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008907 /* allocate enough for a simple 1:1 translation without
8908 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008909 _PyUnicodeWriter_Init(&writer);
8910 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912
Victor Stinner872b2912014-04-05 14:27:07 +02008913 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8914
Victor Stinner33798672016-03-01 21:59:58 +01008915 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008916 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008917 if (PyUnicode_IS_ASCII(input)) {
8918 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8919 if (res < 0) {
8920 _PyUnicodeWriter_Dealloc(&writer);
8921 return NULL;
8922 }
8923 if (res == 1)
8924 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008925 }
Victor Stinner33798672016-03-01 21:59:58 +01008926 else {
8927 i = 0;
8928 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008932 int translate;
8933 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8934 Py_ssize_t newpos;
8935 /* startpos for collecting untranslatable chars */
8936 Py_ssize_t collstart;
8937 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008938 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939
Victor Stinner1194ea02014-04-04 19:37:40 +02008940 ch = PyUnicode_READ(kind, data, i);
8941 translate = charmaptranslate_output(ch, mapping, &writer);
8942 if (translate < 0)
8943 goto onError;
8944
8945 if (translate != 0) {
8946 /* it worked => adjust input pointer */
8947 ++i;
8948 continue;
8949 }
8950
8951 /* untranslatable character */
8952 collstart = i;
8953 collend = i+1;
8954
8955 /* find all untranslatable characters */
8956 while (collend < size) {
8957 PyObject *x;
8958 ch = PyUnicode_READ(kind, data, collend);
8959 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008960 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008961 Py_XDECREF(x);
8962 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008964 ++collend;
8965 }
8966
8967 if (ignore) {
8968 i = collend;
8969 }
8970 else {
8971 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8972 reason, input, &exc,
8973 collstart, collend, &newpos);
8974 if (repunicode == NULL)
8975 goto onError;
8976 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008977 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008978 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008979 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008980 Py_DECREF(repunicode);
8981 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008982 }
8983 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008984 Py_XDECREF(exc);
8985 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008986 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987
Benjamin Peterson29060642009-01-31 22:14:21 +00008988 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008989 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008990 Py_XDECREF(exc);
8991 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992 return NULL;
8993}
8994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995/* Deprecated. Use PyUnicode_Translate instead. */
8996PyObject *
8997PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8998 Py_ssize_t size,
8999 PyObject *mapping,
9000 const char *errors)
9001{
Christian Heimes5f520f42012-09-11 14:03:25 +02009002 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009003 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 if (!unicode)
9005 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009006 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9007 Py_DECREF(unicode);
9008 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009}
9010
Alexander Belopolsky40018472011-02-26 01:02:56 +00009011PyObject *
9012PyUnicode_Translate(PyObject *str,
9013 PyObject *mapping,
9014 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009016 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009017 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009018 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019}
Tim Petersced69f82003-09-16 20:30:58 +00009020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021PyObject *
9022_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9023{
9024 if (!PyUnicode_Check(unicode)) {
9025 PyErr_BadInternalCall();
9026 return NULL;
9027 }
9028 if (PyUnicode_READY(unicode) == -1)
9029 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009030 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 /* If the string is already ASCII, just return the same string */
9032 Py_INCREF(unicode);
9033 return unicode;
9034 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009035
9036 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9037 PyObject *result = PyUnicode_New(len, 127);
9038 if (result == NULL) {
9039 return NULL;
9040 }
9041
9042 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9043 int kind = PyUnicode_KIND(unicode);
9044 const void *data = PyUnicode_DATA(unicode);
9045 Py_ssize_t i;
9046 for (i = 0; i < len; ++i) {
9047 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9048 if (ch < 127) {
9049 out[i] = ch;
9050 }
9051 else if (Py_UNICODE_ISSPACE(ch)) {
9052 out[i] = ' ';
9053 }
9054 else {
9055 int decimal = Py_UNICODE_TODECIMAL(ch);
9056 if (decimal < 0) {
9057 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009058 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009059 _PyUnicode_LENGTH(result) = i + 1;
9060 break;
9061 }
9062 out[i] = '0' + decimal;
9063 }
9064 }
9065
INADA Naoki16dfca42018-07-14 12:06:43 +09009066 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009067 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068}
9069
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009070PyObject *
9071PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9072 Py_ssize_t length)
9073{
Victor Stinnerf0124502011-11-21 23:12:56 +01009074 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009075 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009076 Py_UCS4 maxchar;
9077 enum PyUnicode_Kind kind;
9078 void *data;
9079
Victor Stinner99d7ad02012-02-22 13:37:39 +01009080 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009081 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009082 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009083 if (ch > 127) {
9084 int decimal = Py_UNICODE_TODECIMAL(ch);
9085 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009086 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009087 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009088 }
9089 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009090
9091 /* Copy to a new string */
9092 decimal = PyUnicode_New(length, maxchar);
9093 if (decimal == NULL)
9094 return decimal;
9095 kind = PyUnicode_KIND(decimal);
9096 data = PyUnicode_DATA(decimal);
9097 /* Iterate over code points */
9098 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009099 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009100 if (ch > 127) {
9101 int decimal = Py_UNICODE_TODECIMAL(ch);
9102 if (decimal >= 0)
9103 ch = '0' + decimal;
9104 }
9105 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009106 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009107 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009108}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009109/* --- Decimal Encoder ---------------------------------------------------- */
9110
Alexander Belopolsky40018472011-02-26 01:02:56 +00009111int
9112PyUnicode_EncodeDecimal(Py_UNICODE *s,
9113 Py_ssize_t length,
9114 char *output,
9115 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009116{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009117 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009118 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009119 enum PyUnicode_Kind kind;
9120 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009121
9122 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 PyErr_BadArgument();
9124 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009125 }
9126
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009127 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009128 if (unicode == NULL)
9129 return -1;
9130
Victor Stinner42bf7752011-11-21 22:52:58 +01009131 kind = PyUnicode_KIND(unicode);
9132 data = PyUnicode_DATA(unicode);
9133
Victor Stinnerb84d7232011-11-22 01:50:07 +01009134 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009135 PyObject *exc;
9136 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009138 Py_ssize_t startpos;
9139
9140 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009141
Benjamin Peterson29060642009-01-31 22:14:21 +00009142 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009143 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009144 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009145 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009146 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 decimal = Py_UNICODE_TODECIMAL(ch);
9148 if (decimal >= 0) {
9149 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009150 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 continue;
9152 }
9153 if (0 < ch && ch < 256) {
9154 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009155 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 continue;
9157 }
Victor Stinner6345be92011-11-25 20:09:01 +01009158
Victor Stinner42bf7752011-11-21 22:52:58 +01009159 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009160 exc = NULL;
9161 raise_encode_exception(&exc, "decimal", unicode,
9162 startpos, startpos+1,
9163 "invalid decimal Unicode string");
9164 Py_XDECREF(exc);
9165 Py_DECREF(unicode);
9166 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009167 }
9168 /* 0-terminate the output string */
9169 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009170 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009171 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009172}
9173
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174/* --- Helpers ------------------------------------------------------------ */
9175
/* Helper macro to fix up start/end slice values against a length.

   - `end` greater than `len` is clamped to `len`; a negative `end` has
     `len` added to it and is then clamped at 0.
   - a negative `start` has `len` added to it and is then clamped at 0.

   `start` and `end` must be modifiable lvalues; they are updated in place.
   `len` is evaluated more than once, so it must have no side effects.

   The body is wrapped in do { ... } while (0) so the macro expands to a
   single statement and can be used safely in an unbraced if/else.  Invoke
   it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009192any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009194 Py_ssize_t end,
9195 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009197 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009198 void *buf1, *buf2;
9199 Py_ssize_t len1, len2, result;
9200
9201 kind1 = PyUnicode_KIND(s1);
9202 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009203 if (kind1 < kind2)
9204 return -1;
9205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 len1 = PyUnicode_GET_LENGTH(s1);
9207 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009208 ADJUST_INDICES(start, end, len1);
9209 if (end - start < len2)
9210 return -1;
9211
9212 buf1 = PyUnicode_DATA(s1);
9213 buf2 = PyUnicode_DATA(s2);
9214 if (len2 == 1) {
9215 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9216 result = findchar((const char *)buf1 + kind1*start,
9217 kind1, end - start, ch, direction);
9218 if (result == -1)
9219 return -1;
9220 else
9221 return start + result;
9222 }
9223
9224 if (kind2 != kind1) {
9225 buf2 = _PyUnicode_AsKind(s2, kind1);
9226 if (!buf2)
9227 return -2;
9228 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229
Victor Stinner794d5672011-10-10 03:21:36 +02009230 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009231 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009232 case PyUnicode_1BYTE_KIND:
9233 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9234 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9235 else
9236 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9237 break;
9238 case PyUnicode_2BYTE_KIND:
9239 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9240 break;
9241 case PyUnicode_4BYTE_KIND:
9242 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9243 break;
9244 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009245 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009246 }
9247 }
9248 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009249 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009250 case PyUnicode_1BYTE_KIND:
9251 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9252 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9253 else
9254 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9255 break;
9256 case PyUnicode_2BYTE_KIND:
9257 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9258 break;
9259 case PyUnicode_4BYTE_KIND:
9260 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9261 break;
9262 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009263 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 }
9266
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009267 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268 PyMem_Free(buf2);
9269
9270 return result;
9271}
9272
Victor Stinner59423e32018-11-26 13:40:01 +01009273/* _PyUnicode_InsertThousandsGrouping() helper functions */
9274#include "stringlib/localeutil.h"
9275
/**
 * InsertThousandsGrouping:
 * @writer: Unicode writer, or NULL to only count (see the modes below).
 * @n_buffer: Number of characters reserved for the output, counted from
 *            the current position of @writer.
 * @digits: Digits we're reading from. If @writer is NULL (counting mode),
 *          this is unused and must be NULL.
 * @d_pos: Start of digits string.
 * @n_digits: The number of digits in the string, in which we want
 *            to put the grouping chars.
 * @min_width: The minimum width of the digits in the output string.
 *             Output will be zero-padded on the left to fill.
 * @grouping: see definition in localeconv().
 * @thousands_sep: see definition in localeconv().
 * @maxchar: Counting mode only: initialised to 127 and handed to the fill
 *           helper. Must be NULL in filling mode.
 *
 * There are 2 modes: counting and filling. If @writer is NULL,
 *  we are in counting mode, else filling mode.
 * If counting, the required buffer size is returned.
 * If filling, we know the buffer will be large enough, so we don't
 *  need to pass in the buffer size.
 * Inserts thousand grouping characters (as defined by grouping and
 *  thousands_sep) into @writer.
 *
 * Return value: -1 on error, number of characters otherwise.
 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009300_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009301 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009302 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009303 PyObject *digits,
9304 Py_ssize_t d_pos,
9305 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009306 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009307 const char *grouping,
9308 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009309 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310{
Xtreak3f7983a2019-01-07 20:39:14 +05309311 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009312 if (writer) {
9313 assert(digits != NULL);
9314 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009315 }
9316 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009317 assert(digits == NULL);
9318 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009319 }
Victor Stinner59423e32018-11-26 13:40:01 +01009320 assert(0 <= d_pos);
9321 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009322 assert(grouping != NULL);
9323
9324 if (digits != NULL) {
9325 if (PyUnicode_READY(digits) == -1) {
9326 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009327 }
Victor Stinner59423e32018-11-26 13:40:01 +01009328 }
9329 if (PyUnicode_READY(thousands_sep) == -1) {
9330 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009331 }
9332
Victor Stinner59423e32018-11-26 13:40:01 +01009333 Py_ssize_t count = 0;
9334 Py_ssize_t n_zeros;
9335 int loop_broken = 0;
9336 int use_separator = 0; /* First time through, don't append the
9337 separator. They only go between
9338 groups. */
9339 Py_ssize_t buffer_pos;
9340 Py_ssize_t digits_pos;
9341 Py_ssize_t len;
9342 Py_ssize_t n_chars;
9343 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9344 be looked at */
9345 /* A generator that returns all of the grouping widths, until it
9346 returns 0. */
9347 GroupGenerator groupgen;
9348 GroupGenerator_init(&groupgen, grouping);
9349 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9350
9351 /* if digits are not grouped, thousands separator
9352 should be an empty string */
9353 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9354
9355 digits_pos = d_pos + n_digits;
9356 if (writer) {
9357 buffer_pos = writer->pos + n_buffer;
9358 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9359 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 }
Victor Stinner59423e32018-11-26 13:40:01 +01009361 else {
9362 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009363 }
Victor Stinner59423e32018-11-26 13:40:01 +01009364
9365 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009366 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 }
Victor Stinner59423e32018-11-26 13:40:01 +01009368
9369 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9370 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9371 n_zeros = Py_MAX(0, len - remaining);
9372 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9373
9374 /* Use n_zero zero's and n_chars chars */
9375
9376 /* Count only, don't do anything. */
9377 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9378
9379 /* Copy into the writer. */
9380 InsertThousandsGrouping_fill(writer, &buffer_pos,
9381 digits, &digits_pos,
9382 n_chars, n_zeros,
9383 use_separator ? thousands_sep : NULL,
9384 thousands_sep_len, maxchar);
9385
9386 /* Use a separator next time. */
9387 use_separator = 1;
9388
9389 remaining -= n_chars;
9390 min_width -= len;
9391
9392 if (remaining <= 0 && min_width <= 0) {
9393 loop_broken = 1;
9394 break;
9395 }
9396 min_width -= thousands_sep_len;
9397 }
9398 if (!loop_broken) {
9399 /* We left the loop without using a break statement. */
9400
9401 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9402 n_zeros = Py_MAX(0, len - remaining);
9403 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9404
9405 /* Use n_zero zero's and n_chars chars */
9406 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9407
9408 /* Copy into the writer. */
9409 InsertThousandsGrouping_fill(writer, &buffer_pos,
9410 digits, &digits_pos,
9411 n_chars, n_zeros,
9412 use_separator ? thousands_sep : NULL,
9413 thousands_sep_len, maxchar);
9414 }
9415 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009416}
9417
9418
Alexander Belopolsky40018472011-02-26 01:02:56 +00009419Py_ssize_t
9420PyUnicode_Count(PyObject *str,
9421 PyObject *substr,
9422 Py_ssize_t start,
9423 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009425 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009426 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 void *buf1 = NULL, *buf2 = NULL;
9428 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009429
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009430 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009431 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009432
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009433 kind1 = PyUnicode_KIND(str);
9434 kind2 = PyUnicode_KIND(substr);
9435 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009436 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009437
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009438 len1 = PyUnicode_GET_LENGTH(str);
9439 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009441 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009442 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009443
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009444 buf1 = PyUnicode_DATA(str);
9445 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009446 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009447 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009448 if (!buf2)
9449 goto onError;
9450 }
9451
9452 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009454 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009455 result = asciilib_count(
9456 ((Py_UCS1*)buf1) + start, end - start,
9457 buf2, len2, PY_SSIZE_T_MAX
9458 );
9459 else
9460 result = ucs1lib_count(
9461 ((Py_UCS1*)buf1) + start, end - start,
9462 buf2, len2, PY_SSIZE_T_MAX
9463 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 break;
9465 case PyUnicode_2BYTE_KIND:
9466 result = ucs2lib_count(
9467 ((Py_UCS2*)buf1) + start, end - start,
9468 buf2, len2, PY_SSIZE_T_MAX
9469 );
9470 break;
9471 case PyUnicode_4BYTE_KIND:
9472 result = ucs4lib_count(
9473 ((Py_UCS4*)buf1) + start, end - start,
9474 buf2, len2, PY_SSIZE_T_MAX
9475 );
9476 break;
9477 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009478 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009480
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009481 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 PyMem_Free(buf2);
9483
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009486 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 PyMem_Free(buf2);
9488 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489}
9490
Alexander Belopolsky40018472011-02-26 01:02:56 +00009491Py_ssize_t
9492PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009493 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009494 Py_ssize_t start,
9495 Py_ssize_t end,
9496 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009498 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009499 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009500
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009501 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502}
9503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504Py_ssize_t
9505PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9506 Py_ssize_t start, Py_ssize_t end,
9507 int direction)
9508{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009510 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 if (PyUnicode_READY(str) == -1)
9512 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009513 len = PyUnicode_GET_LENGTH(str);
9514 ADJUST_INDICES(start, end, len);
9515 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009516 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009518 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9519 kind, end-start, ch, direction);
9520 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009522 else
9523 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524}
9525
Alexander Belopolsky40018472011-02-26 01:02:56 +00009526static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009527tailmatch(PyObject *self,
9528 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009529 Py_ssize_t start,
9530 Py_ssize_t end,
9531 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 int kind_self;
9534 int kind_sub;
9535 void *data_self;
9536 void *data_sub;
9537 Py_ssize_t offset;
9538 Py_ssize_t i;
9539 Py_ssize_t end_sub;
9540
9541 if (PyUnicode_READY(self) == -1 ||
9542 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009543 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9546 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009548 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009549
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009550 if (PyUnicode_GET_LENGTH(substring) == 0)
9551 return 1;
9552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553 kind_self = PyUnicode_KIND(self);
9554 data_self = PyUnicode_DATA(self);
9555 kind_sub = PyUnicode_KIND(substring);
9556 data_sub = PyUnicode_DATA(substring);
9557 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9558
9559 if (direction > 0)
9560 offset = end;
9561 else
9562 offset = start;
9563
9564 if (PyUnicode_READ(kind_self, data_self, offset) ==
9565 PyUnicode_READ(kind_sub, data_sub, 0) &&
9566 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9567 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9568 /* If both are of the same kind, memcmp is sufficient */
9569 if (kind_self == kind_sub) {
9570 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009571 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 data_sub,
9573 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009574 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009575 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009576 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 else {
9578 /* We do not need to compare 0 and len(substring)-1 because
9579 the if statement above ensured already that they are equal
9580 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 for (i = 1; i < end_sub; ++i) {
9582 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9583 PyUnicode_READ(kind_sub, data_sub, i))
9584 return 0;
9585 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009588 }
9589
9590 return 0;
9591}
9592
Alexander Belopolsky40018472011-02-26 01:02:56 +00009593Py_ssize_t
9594PyUnicode_Tailmatch(PyObject *str,
9595 PyObject *substr,
9596 Py_ssize_t start,
9597 Py_ssize_t end,
9598 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009600 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009601 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009602
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009603 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604}
9605
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009606static PyObject *
9607ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009609 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9610 char *resdata, *data = PyUnicode_DATA(self);
9611 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009612
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009613 res = PyUnicode_New(len, 127);
9614 if (res == NULL)
9615 return NULL;
9616 resdata = PyUnicode_DATA(res);
9617 if (lower)
9618 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009620 _Py_bytes_upper(resdata, data, len);
9621 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622}
9623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009625handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009627 Py_ssize_t j;
9628 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009629 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009630 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009631
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009632 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9633
9634 where ! is a negation and \p{xxx} is a character with property xxx.
9635 */
9636 for (j = i - 1; j >= 0; j--) {
9637 c = PyUnicode_READ(kind, data, j);
9638 if (!_PyUnicode_IsCaseIgnorable(c))
9639 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009641 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9642 if (final_sigma) {
9643 for (j = i + 1; j < length; j++) {
9644 c = PyUnicode_READ(kind, data, j);
9645 if (!_PyUnicode_IsCaseIgnorable(c))
9646 break;
9647 }
9648 final_sigma = j == length || !_PyUnicode_IsCased(c);
9649 }
9650 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651}
9652
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009653static int
9654lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9655 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009657 /* Obscure special case. */
9658 if (c == 0x3A3) {
9659 mapped[0] = handle_capital_sigma(kind, data, length, i);
9660 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663}
9664
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009665static Py_ssize_t
9666do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668 Py_ssize_t i, k = 0;
9669 int n_res, j;
9670 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009671
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009672 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009673 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009674 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009675 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678 for (i = 1; i < length; i++) {
9679 c = PyUnicode_READ(kind, data, i);
9680 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9681 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009682 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009684 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009685 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009686 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687}
9688
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009689static Py_ssize_t
9690do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9691 Py_ssize_t i, k = 0;
9692
9693 for (i = 0; i < length; i++) {
9694 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9695 int n_res, j;
9696 if (Py_UNICODE_ISUPPER(c)) {
9697 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9698 }
9699 else if (Py_UNICODE_ISLOWER(c)) {
9700 n_res = _PyUnicode_ToUpperFull(c, mapped);
9701 }
9702 else {
9703 n_res = 1;
9704 mapped[0] = c;
9705 }
9706 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009707 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009708 res[k++] = mapped[j];
9709 }
9710 }
9711 return k;
9712}
9713
9714static Py_ssize_t
9715do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9716 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009718 Py_ssize_t i, k = 0;
9719
9720 for (i = 0; i < length; i++) {
9721 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9722 int n_res, j;
9723 if (lower)
9724 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9725 else
9726 n_res = _PyUnicode_ToUpperFull(c, mapped);
9727 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009728 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009729 res[k++] = mapped[j];
9730 }
9731 }
9732 return k;
9733}
9734
9735static Py_ssize_t
9736do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9737{
9738 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9739}
9740
9741static Py_ssize_t
9742do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9743{
9744 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9745}
9746
Benjamin Petersone51757f2012-01-12 21:10:29 -05009747static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009748do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9749{
9750 Py_ssize_t i, k = 0;
9751
9752 for (i = 0; i < length; i++) {
9753 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9754 Py_UCS4 mapped[3];
9755 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9756 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009757 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009758 res[k++] = mapped[j];
9759 }
9760 }
9761 return k;
9762}
9763
9764static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009765do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9766{
9767 Py_ssize_t i, k = 0;
9768 int previous_is_cased;
9769
9770 previous_is_cased = 0;
9771 for (i = 0; i < length; i++) {
9772 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9773 Py_UCS4 mapped[3];
9774 int n_res, j;
9775
9776 if (previous_is_cased)
9777 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9778 else
9779 n_res = _PyUnicode_ToTitleFull(c, mapped);
9780
9781 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009782 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009783 res[k++] = mapped[j];
9784 }
9785
9786 previous_is_cased = _PyUnicode_IsCased(c);
9787 }
9788 return k;
9789}
9790
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009791static PyObject *
9792case_operation(PyObject *self,
9793 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9794{
9795 PyObject *res = NULL;
9796 Py_ssize_t length, newlength = 0;
9797 int kind, outkind;
9798 void *data, *outdata;
9799 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9800
Benjamin Petersoneea48462012-01-16 14:28:50 -05009801 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009802
9803 kind = PyUnicode_KIND(self);
9804 data = PyUnicode_DATA(self);
9805 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009806 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009807 PyErr_SetString(PyExc_OverflowError, "string is too long");
9808 return NULL;
9809 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009810 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009811 if (tmp == NULL)
9812 return PyErr_NoMemory();
9813 newlength = perform(kind, data, length, tmp, &maxchar);
9814 res = PyUnicode_New(newlength, maxchar);
9815 if (res == NULL)
9816 goto leave;
9817 tmpend = tmp + newlength;
9818 outdata = PyUnicode_DATA(res);
9819 outkind = PyUnicode_KIND(res);
9820 switch (outkind) {
9821 case PyUnicode_1BYTE_KIND:
9822 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9823 break;
9824 case PyUnicode_2BYTE_KIND:
9825 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9826 break;
9827 case PyUnicode_4BYTE_KIND:
9828 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9829 break;
9830 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009831 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009832 }
9833 leave:
9834 PyMem_FREE(tmp);
9835 return res;
9836}
9837
Tim Peters8ce9f162004-08-27 01:49:32 +00009838PyObject *
9839PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009840{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009841 PyObject *res;
9842 PyObject *fseq;
9843 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009844 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009846 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009847 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009848 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009849 }
9850
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009851 /* NOTE: the following code can't call back into Python code,
9852 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009853 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009854
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009855 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009856 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009857 res = _PyUnicode_JoinArray(separator, items, seqlen);
9858 Py_DECREF(fseq);
9859 return res;
9860}
9861
9862PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009863_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009864{
9865 PyObject *res = NULL; /* the result */
9866 PyObject *sep = NULL;
9867 Py_ssize_t seplen;
9868 PyObject *item;
9869 Py_ssize_t sz, i, res_offset;
9870 Py_UCS4 maxchar;
9871 Py_UCS4 item_maxchar;
9872 int use_memcpy;
9873 unsigned char *res_data = NULL, *sep_data = NULL;
9874 PyObject *last_obj;
9875 unsigned int kind = 0;
9876
Tim Peters05eba1f2004-08-27 21:32:02 +00009877 /* If empty sequence, return u"". */
9878 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009879 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009880 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009881
Tim Peters05eba1f2004-08-27 21:32:02 +00009882 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009883 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009884 if (seqlen == 1) {
9885 if (PyUnicode_CheckExact(items[0])) {
9886 res = items[0];
9887 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009888 return res;
9889 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009890 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009891 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009892 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009893 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009894 /* Set up sep and seplen */
9895 if (separator == NULL) {
9896 /* fall back to a blank space separator */
9897 sep = PyUnicode_FromOrdinal(' ');
9898 if (!sep)
9899 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009900 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009901 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009902 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009903 else {
9904 if (!PyUnicode_Check(separator)) {
9905 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009906 "separator: expected str instance,"
9907 " %.80s found",
9908 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009909 goto onError;
9910 }
9911 if (PyUnicode_READY(separator))
9912 goto onError;
9913 sep = separator;
9914 seplen = PyUnicode_GET_LENGTH(separator);
9915 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9916 /* inc refcount to keep this code path symmetric with the
9917 above case of a blank separator */
9918 Py_INCREF(sep);
9919 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009920 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009921 }
9922
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009923 /* There are at least two things to join, or else we have a subclass
9924 * of str in the sequence.
9925 * Do a pre-pass to figure out the total amount of space we'll
9926 * need (sz), and see whether all argument are strings.
9927 */
9928 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009929#ifdef Py_DEBUG
9930 use_memcpy = 0;
9931#else
9932 use_memcpy = 1;
9933#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009934 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009935 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009936 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009937 if (!PyUnicode_Check(item)) {
9938 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009939 "sequence item %zd: expected str instance,"
9940 " %.80s found",
9941 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009942 goto onError;
9943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 if (PyUnicode_READY(item) == -1)
9945 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009946 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009948 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009949 if (i != 0) {
9950 add_sz += seplen;
9951 }
9952 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009953 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009954 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009955 goto onError;
9956 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009957 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009958 if (use_memcpy && last_obj != NULL) {
9959 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9960 use_memcpy = 0;
9961 }
9962 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009963 }
Tim Petersced69f82003-09-16 20:30:58 +00009964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009966 if (res == NULL)
9967 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009968
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009969 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009970#ifdef Py_DEBUG
9971 use_memcpy = 0;
9972#else
9973 if (use_memcpy) {
9974 res_data = PyUnicode_1BYTE_DATA(res);
9975 kind = PyUnicode_KIND(res);
9976 if (seplen != 0)
9977 sep_data = PyUnicode_1BYTE_DATA(sep);
9978 }
9979#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009980 if (use_memcpy) {
9981 for (i = 0; i < seqlen; ++i) {
9982 Py_ssize_t itemlen;
9983 item = items[i];
9984
9985 /* Copy item, and maybe the separator. */
9986 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009987 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009988 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009989 kind * seplen);
9990 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009991 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009992
9993 itemlen = PyUnicode_GET_LENGTH(item);
9994 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009995 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009996 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009997 kind * itemlen);
9998 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009999 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010000 }
10001 assert(res_data == PyUnicode_1BYTE_DATA(res)
10002 + kind * PyUnicode_GET_LENGTH(res));
10003 }
10004 else {
10005 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10006 Py_ssize_t itemlen;
10007 item = items[i];
10008
10009 /* Copy item, and maybe the separator. */
10010 if (i && seplen != 0) {
10011 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10012 res_offset += seplen;
10013 }
10014
10015 itemlen = PyUnicode_GET_LENGTH(item);
10016 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010017 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010018 res_offset += itemlen;
10019 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010020 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010021 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010022 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010025 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027
Benjamin Peterson29060642009-01-31 22:14:21 +000010028 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010030 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031 return NULL;
10032}
10033
Victor Stinnerd3f08822012-05-29 12:57:52 +020010034void
10035_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10036 Py_UCS4 fill_char)
10037{
10038 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010039 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010040 assert(PyUnicode_IS_READY(unicode));
10041 assert(unicode_modifiable(unicode));
10042 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10043 assert(start >= 0);
10044 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010045 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010046}
10047
Victor Stinner3fe55312012-01-04 00:33:50 +010010048Py_ssize_t
10049PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10050 Py_UCS4 fill_char)
10051{
10052 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010053
10054 if (!PyUnicode_Check(unicode)) {
10055 PyErr_BadInternalCall();
10056 return -1;
10057 }
10058 if (PyUnicode_READY(unicode) == -1)
10059 return -1;
10060 if (unicode_check_modifiable(unicode))
10061 return -1;
10062
Victor Stinnerd3f08822012-05-29 12:57:52 +020010063 if (start < 0) {
10064 PyErr_SetString(PyExc_IndexError, "string index out of range");
10065 return -1;
10066 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010067 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10068 PyErr_SetString(PyExc_ValueError,
10069 "fill character is bigger than "
10070 "the string maximum character");
10071 return -1;
10072 }
10073
10074 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10075 length = Py_MIN(maxlen, length);
10076 if (length <= 0)
10077 return 0;
10078
Victor Stinnerd3f08822012-05-29 12:57:52 +020010079 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010080 return length;
10081}
10082
Victor Stinner9310abb2011-10-05 00:59:23 +020010083static PyObject *
10084pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010085 Py_ssize_t left,
10086 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 PyObject *u;
10090 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010091 int kind;
10092 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010093
10094 if (left < 0)
10095 left = 0;
10096 if (right < 0)
10097 right = 0;
10098
Victor Stinnerc4b49542011-12-11 22:44:26 +010010099 if (left == 0 && right == 0)
10100 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10103 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010104 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10105 return NULL;
10106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010108 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010110 if (!u)
10111 return NULL;
10112
10113 kind = PyUnicode_KIND(u);
10114 data = PyUnicode_DATA(u);
10115 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010116 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010117 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010118 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010119 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010120 assert(_PyUnicode_CheckConsistency(u, 1));
10121 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010122}
10123
Alexander Belopolsky40018472011-02-26 01:02:56 +000010124PyObject *
10125PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010126{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010129 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131
Benjamin Petersonead6b532011-12-20 17:23:42 -060010132 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010134 if (PyUnicode_IS_ASCII(string))
10135 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010136 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010137 PyUnicode_GET_LENGTH(string), keepends);
10138 else
10139 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010140 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010141 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 break;
10143 case PyUnicode_2BYTE_KIND:
10144 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010145 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 PyUnicode_GET_LENGTH(string), keepends);
10147 break;
10148 case PyUnicode_4BYTE_KIND:
10149 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010150 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 PyUnicode_GET_LENGTH(string), keepends);
10152 break;
10153 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010154 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157}
10158
Alexander Belopolsky40018472011-02-26 01:02:56 +000010159static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010160split(PyObject *self,
10161 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010162 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010164 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 void *buf1, *buf2;
10166 Py_ssize_t len1, len2;
10167 PyObject* out;
10168
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010170 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 if (PyUnicode_READY(self) == -1)
10173 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010176 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010178 if (PyUnicode_IS_ASCII(self))
10179 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010180 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010181 PyUnicode_GET_LENGTH(self), maxcount
10182 );
10183 else
10184 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010185 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010186 PyUnicode_GET_LENGTH(self), maxcount
10187 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 case PyUnicode_2BYTE_KIND:
10189 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010190 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 PyUnicode_GET_LENGTH(self), maxcount
10192 );
10193 case PyUnicode_4BYTE_KIND:
10194 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010195 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 PyUnicode_GET_LENGTH(self), maxcount
10197 );
10198 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010199 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 }
10201
10202 if (PyUnicode_READY(substring) == -1)
10203 return NULL;
10204
10205 kind1 = PyUnicode_KIND(self);
10206 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 len1 = PyUnicode_GET_LENGTH(self);
10208 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010209 if (kind1 < kind2 || len1 < len2) {
10210 out = PyList_New(1);
10211 if (out == NULL)
10212 return NULL;
10213 Py_INCREF(self);
10214 PyList_SET_ITEM(out, 0, self);
10215 return out;
10216 }
10217 buf1 = PyUnicode_DATA(self);
10218 buf2 = PyUnicode_DATA(substring);
10219 if (kind2 != kind1) {
10220 buf2 = _PyUnicode_AsKind(substring, kind1);
10221 if (!buf2)
10222 return NULL;
10223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010225 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010227 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10228 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010229 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010230 else
10231 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010232 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 break;
10234 case PyUnicode_2BYTE_KIND:
10235 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010236 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 break;
10238 case PyUnicode_4BYTE_KIND:
10239 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010240 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 break;
10242 default:
10243 out = NULL;
10244 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010245 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 PyMem_Free(buf2);
10247 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248}
10249
Alexander Belopolsky40018472011-02-26 01:02:56 +000010250static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010251rsplit(PyObject *self,
10252 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010253 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010254{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010255 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 void *buf1, *buf2;
10257 Py_ssize_t len1, len2;
10258 PyObject* out;
10259
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010260 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010261 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 if (PyUnicode_READY(self) == -1)
10264 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010267 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010269 if (PyUnicode_IS_ASCII(self))
10270 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010271 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010272 PyUnicode_GET_LENGTH(self), maxcount
10273 );
10274 else
10275 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010276 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010277 PyUnicode_GET_LENGTH(self), maxcount
10278 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 case PyUnicode_2BYTE_KIND:
10280 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010281 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 PyUnicode_GET_LENGTH(self), maxcount
10283 );
10284 case PyUnicode_4BYTE_KIND:
10285 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010286 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 PyUnicode_GET_LENGTH(self), maxcount
10288 );
10289 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010290 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 }
10292
10293 if (PyUnicode_READY(substring) == -1)
10294 return NULL;
10295
10296 kind1 = PyUnicode_KIND(self);
10297 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 len1 = PyUnicode_GET_LENGTH(self);
10299 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010300 if (kind1 < kind2 || len1 < len2) {
10301 out = PyList_New(1);
10302 if (out == NULL)
10303 return NULL;
10304 Py_INCREF(self);
10305 PyList_SET_ITEM(out, 0, self);
10306 return out;
10307 }
10308 buf1 = PyUnicode_DATA(self);
10309 buf2 = PyUnicode_DATA(substring);
10310 if (kind2 != kind1) {
10311 buf2 = _PyUnicode_AsKind(substring, kind1);
10312 if (!buf2)
10313 return NULL;
10314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010316 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010318 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10319 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010320 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010321 else
10322 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010323 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 break;
10325 case PyUnicode_2BYTE_KIND:
10326 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010327 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 break;
10329 case PyUnicode_4BYTE_KIND:
10330 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010331 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 break;
10333 default:
10334 out = NULL;
10335 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010336 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 PyMem_Free(buf2);
10338 return out;
10339}
10340
10341static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010342anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10343 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010345 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010347 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10348 return asciilib_find(buf1, len1, buf2, len2, offset);
10349 else
10350 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 case PyUnicode_2BYTE_KIND:
10352 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10353 case PyUnicode_4BYTE_KIND:
10354 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10355 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010356 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357}
10358
10359static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010360anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10361 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010363 switch (kind) {
10364 case PyUnicode_1BYTE_KIND:
10365 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10366 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10367 else
10368 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10369 case PyUnicode_2BYTE_KIND:
10370 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10371 case PyUnicode_4BYTE_KIND:
10372 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10373 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010374 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010375}
10376
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010377static void
10378replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10379 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10380{
10381 int kind = PyUnicode_KIND(u);
10382 void *data = PyUnicode_DATA(u);
10383 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10384 if (kind == PyUnicode_1BYTE_KIND) {
10385 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10386 (Py_UCS1 *)data + len,
10387 u1, u2, maxcount);
10388 }
10389 else if (kind == PyUnicode_2BYTE_KIND) {
10390 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10391 (Py_UCS2 *)data + len,
10392 u1, u2, maxcount);
10393 }
10394 else {
10395 assert(kind == PyUnicode_4BYTE_KIND);
10396 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10397 (Py_UCS4 *)data + len,
10398 u1, u2, maxcount);
10399 }
10400}
10401
Alexander Belopolsky40018472011-02-26 01:02:56 +000010402static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403replace(PyObject *self, PyObject *str1,
10404 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 PyObject *u;
10407 char *sbuf = PyUnicode_DATA(self);
10408 char *buf1 = PyUnicode_DATA(str1);
10409 char *buf2 = PyUnicode_DATA(str2);
10410 int srelease = 0, release1 = 0, release2 = 0;
10411 int skind = PyUnicode_KIND(self);
10412 int kind1 = PyUnicode_KIND(str1);
10413 int kind2 = PyUnicode_KIND(str2);
10414 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10415 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10416 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010417 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010418 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419
10420 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010421 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010423 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010424
Victor Stinner59de0ee2011-10-07 10:01:28 +020010425 if (str1 == str2)
10426 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427
Victor Stinner49a0a212011-10-12 23:46:10 +020010428 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010429 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10430 if (maxchar < maxchar_str1)
10431 /* substring too wide to be present */
10432 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010433 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10434 /* Replacing str1 with str2 may cause a maxchar reduction in the
10435 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010436 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010437 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010440 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010442 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010444 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010445 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010446 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010447
Victor Stinner69ed0f42013-04-09 21:48:24 +020010448 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010449 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010450 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010451 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010452 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010456
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010457 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10458 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010459 }
10460 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 int rkind = skind;
10462 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010463 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 if (kind1 < rkind) {
10466 /* widen substring */
10467 buf1 = _PyUnicode_AsKind(str1, rkind);
10468 if (!buf1) goto error;
10469 release1 = 1;
10470 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010471 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010472 if (i < 0)
10473 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 if (rkind > kind2) {
10475 /* widen replacement */
10476 buf2 = _PyUnicode_AsKind(str2, rkind);
10477 if (!buf2) goto error;
10478 release2 = 1;
10479 }
10480 else if (rkind < kind2) {
10481 /* widen self and buf1 */
10482 rkind = kind2;
10483 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010484 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 sbuf = _PyUnicode_AsKind(self, rkind);
10486 if (!sbuf) goto error;
10487 srelease = 1;
10488 buf1 = _PyUnicode_AsKind(str1, rkind);
10489 if (!buf1) goto error;
10490 release1 = 1;
10491 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010492 u = PyUnicode_New(slen, maxchar);
10493 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010495 assert(PyUnicode_KIND(u) == rkind);
10496 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010497
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010498 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010499 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010500 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010502 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010504
10505 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010506 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010507 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010508 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010509 if (i == -1)
10510 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010511 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010513 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010517 }
10518 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010520 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 int rkind = skind;
10522 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010525 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 buf1 = _PyUnicode_AsKind(str1, rkind);
10527 if (!buf1) goto error;
10528 release1 = 1;
10529 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010530 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010531 if (n == 0)
10532 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010534 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 buf2 = _PyUnicode_AsKind(str2, rkind);
10536 if (!buf2) goto error;
10537 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010540 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 rkind = kind2;
10542 sbuf = _PyUnicode_AsKind(self, rkind);
10543 if (!sbuf) goto error;
10544 srelease = 1;
10545 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010546 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 buf1 = _PyUnicode_AsKind(str1, rkind);
10548 if (!buf1) goto error;
10549 release1 = 1;
10550 }
10551 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10552 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010553 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 PyErr_SetString(PyExc_OverflowError,
10555 "replace string is too long");
10556 goto error;
10557 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010558 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010559 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010560 _Py_INCREF_UNICODE_EMPTY();
10561 if (!unicode_empty)
10562 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010563 u = unicode_empty;
10564 goto done;
10565 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010566 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 PyErr_SetString(PyExc_OverflowError,
10568 "replace string is too long");
10569 goto error;
10570 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010571 u = PyUnicode_New(new_size, maxchar);
10572 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010574 assert(PyUnicode_KIND(u) == rkind);
10575 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 ires = i = 0;
10577 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010578 while (n-- > 0) {
10579 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010580 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010581 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010582 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010583 if (j == -1)
10584 break;
10585 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010586 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010587 memcpy(res + rkind * ires,
10588 sbuf + rkind * i,
10589 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010591 }
10592 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010594 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010596 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010602 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010603 memcpy(res + rkind * ires,
10604 sbuf + rkind * i,
10605 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010606 }
10607 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010608 /* interleave */
10609 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010610 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010612 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010614 if (--n <= 0)
10615 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010616 memcpy(res + rkind * ires,
10617 sbuf + rkind * i,
10618 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 ires++;
10620 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010621 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010622 memcpy(res + rkind * ires,
10623 sbuf + rkind * i,
10624 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010625 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010626 }
10627
10628 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010629 unicode_adjust_maxchar(&u);
10630 if (u == NULL)
10631 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010633
10634 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 if (srelease)
10636 PyMem_FREE(sbuf);
10637 if (release1)
10638 PyMem_FREE(buf1);
10639 if (release2)
10640 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010641 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010643
Benjamin Peterson29060642009-01-31 22:14:21 +000010644 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010645 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 if (srelease)
10647 PyMem_FREE(sbuf);
10648 if (release1)
10649 PyMem_FREE(buf1);
10650 if (release2)
10651 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010652 return unicode_result_unchanged(self);
10653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 error:
10655 if (srelease && sbuf)
10656 PyMem_FREE(sbuf);
10657 if (release1 && buf1)
10658 PyMem_FREE(buf1);
10659 if (release2 && buf2)
10660 PyMem_FREE(buf2);
10661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662}
10663
10664/* --- Unicode Object Methods --------------------------------------------- */
10665
INADA Naoki3ae20562017-01-16 20:41:20 +090010666/*[clinic input]
10667str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668
INADA Naoki3ae20562017-01-16 20:41:20 +090010669Return a version of the string where each word is titlecased.
10670
10671More specifically, words start with uppercased characters and all remaining
10672cased characters have lower case.
10673[clinic start generated code]*/
10674
10675static PyObject *
10676unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010677/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010679 if (PyUnicode_READY(self) == -1)
10680 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010681 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682}
10683
INADA Naoki3ae20562017-01-16 20:41:20 +090010684/*[clinic input]
10685str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686
INADA Naoki3ae20562017-01-16 20:41:20 +090010687Return a capitalized version of the string.
10688
10689More specifically, make the first character have upper case and the rest lower
10690case.
10691[clinic start generated code]*/
10692
10693static PyObject *
10694unicode_capitalize_impl(PyObject *self)
10695/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010697 if (PyUnicode_READY(self) == -1)
10698 return NULL;
10699 if (PyUnicode_GET_LENGTH(self) == 0)
10700 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010701 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702}
10703
INADA Naoki3ae20562017-01-16 20:41:20 +090010704/*[clinic input]
10705str.casefold as unicode_casefold
10706
10707Return a version of the string suitable for caseless comparisons.
10708[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010709
10710static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010711unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010712/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010713{
10714 if (PyUnicode_READY(self) == -1)
10715 return NULL;
10716 if (PyUnicode_IS_ASCII(self))
10717 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010718 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010719}
10720
10721
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010722/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010723
10724static int
10725convert_uc(PyObject *obj, void *addr)
10726{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010728
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010729 if (!PyUnicode_Check(obj)) {
10730 PyErr_Format(PyExc_TypeError,
10731 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010732 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010733 return 0;
10734 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010735 if (PyUnicode_READY(obj) < 0)
10736 return 0;
10737 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010738 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010739 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010740 return 0;
10741 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010742 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010743 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010744}
10745
INADA Naoki3ae20562017-01-16 20:41:20 +090010746/*[clinic input]
10747str.center as unicode_center
10748
10749 width: Py_ssize_t
10750 fillchar: Py_UCS4 = ' '
10751 /
10752
10753Return a centered string of length width.
10754
10755Padding is done using the specified fill character (default is a space).
10756[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757
10758static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010759unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10760/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010762 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763
Benjamin Petersonbac79492012-01-14 13:34:47 -050010764 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765 return NULL;
10766
Victor Stinnerc4b49542011-12-11 22:44:26 +010010767 if (PyUnicode_GET_LENGTH(self) >= width)
10768 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769
Victor Stinnerc4b49542011-12-11 22:44:26 +010010770 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771 left = marg / 2 + (marg & width & 1);
10772
Victor Stinner9310abb2011-10-05 00:59:23 +020010773 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774}
10775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776/* This function assumes that str1 and str2 are readied by the caller. */
10777
Marc-André Lemburge5034372000-08-08 08:04:29 +000010778static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010779unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010780{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010781#define COMPARE(TYPE1, TYPE2) \
10782 do { \
10783 TYPE1* p1 = (TYPE1 *)data1; \
10784 TYPE2* p2 = (TYPE2 *)data2; \
10785 TYPE1* end = p1 + len; \
10786 Py_UCS4 c1, c2; \
10787 for (; p1 != end; p1++, p2++) { \
10788 c1 = *p1; \
10789 c2 = *p2; \
10790 if (c1 != c2) \
10791 return (c1 < c2) ? -1 : 1; \
10792 } \
10793 } \
10794 while (0)
10795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 int kind1, kind2;
10797 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010798 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 kind1 = PyUnicode_KIND(str1);
10801 kind2 = PyUnicode_KIND(str2);
10802 data1 = PyUnicode_DATA(str1);
10803 data2 = PyUnicode_DATA(str2);
10804 len1 = PyUnicode_GET_LENGTH(str1);
10805 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010806 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010807
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010808 switch(kind1) {
10809 case PyUnicode_1BYTE_KIND:
10810 {
10811 switch(kind2) {
10812 case PyUnicode_1BYTE_KIND:
10813 {
10814 int cmp = memcmp(data1, data2, len);
10815 /* normalize result of memcmp() into the range [-1; 1] */
10816 if (cmp < 0)
10817 return -1;
10818 if (cmp > 0)
10819 return 1;
10820 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010821 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010822 case PyUnicode_2BYTE_KIND:
10823 COMPARE(Py_UCS1, Py_UCS2);
10824 break;
10825 case PyUnicode_4BYTE_KIND:
10826 COMPARE(Py_UCS1, Py_UCS4);
10827 break;
10828 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010829 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010830 }
10831 break;
10832 }
10833 case PyUnicode_2BYTE_KIND:
10834 {
10835 switch(kind2) {
10836 case PyUnicode_1BYTE_KIND:
10837 COMPARE(Py_UCS2, Py_UCS1);
10838 break;
10839 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010840 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010841 COMPARE(Py_UCS2, Py_UCS2);
10842 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010843 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010844 case PyUnicode_4BYTE_KIND:
10845 COMPARE(Py_UCS2, Py_UCS4);
10846 break;
10847 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010848 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010849 }
10850 break;
10851 }
10852 case PyUnicode_4BYTE_KIND:
10853 {
10854 switch(kind2) {
10855 case PyUnicode_1BYTE_KIND:
10856 COMPARE(Py_UCS4, Py_UCS1);
10857 break;
10858 case PyUnicode_2BYTE_KIND:
10859 COMPARE(Py_UCS4, Py_UCS2);
10860 break;
10861 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010862 {
10863#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10864 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10865 /* normalize result of wmemcmp() into the range [-1; 1] */
10866 if (cmp < 0)
10867 return -1;
10868 if (cmp > 0)
10869 return 1;
10870#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010871 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010872#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010873 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010874 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010875 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010876 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010877 }
10878 break;
10879 }
10880 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010881 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010882 }
10883
Victor Stinner770e19e2012-10-04 22:59:45 +020010884 if (len1 == len2)
10885 return 0;
10886 if (len1 < len2)
10887 return -1;
10888 else
10889 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010890
10891#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010892}
10893
Benjamin Peterson621b4302016-09-09 13:54:34 -070010894static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010895unicode_compare_eq(PyObject *str1, PyObject *str2)
10896{
10897 int kind;
10898 void *data1, *data2;
10899 Py_ssize_t len;
10900 int cmp;
10901
Victor Stinnere5567ad2012-10-23 02:48:49 +020010902 len = PyUnicode_GET_LENGTH(str1);
10903 if (PyUnicode_GET_LENGTH(str2) != len)
10904 return 0;
10905 kind = PyUnicode_KIND(str1);
10906 if (PyUnicode_KIND(str2) != kind)
10907 return 0;
10908 data1 = PyUnicode_DATA(str1);
10909 data2 = PyUnicode_DATA(str2);
10910
10911 cmp = memcmp(data1, data2, len * kind);
10912 return (cmp == 0);
10913}
10914
10915
Alexander Belopolsky40018472011-02-26 01:02:56 +000010916int
10917PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10920 if (PyUnicode_READY(left) == -1 ||
10921 PyUnicode_READY(right) == -1)
10922 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010923
10924 /* a string is equal to itself */
10925 if (left == right)
10926 return 0;
10927
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010928 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010930 PyErr_Format(PyExc_TypeError,
10931 "Can't compare %.100s and %.100s",
10932 left->ob_type->tp_name,
10933 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934 return -1;
10935}
10936
Martin v. Löwis5b222132007-06-10 09:51:05 +000010937int
10938PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10939{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010940 Py_ssize_t i;
10941 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010943 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944
Victor Stinner910337b2011-10-03 03:20:16 +020010945 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010946 if (!PyUnicode_IS_READY(uni)) {
10947 const wchar_t *ws = _PyUnicode_WSTR(uni);
10948 /* Compare Unicode string and source character set string */
10949 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10950 if (chr != ustr[i])
10951 return (chr < ustr[i]) ? -1 : 1;
10952 }
10953 /* This check keeps Python strings that end in '\0' from comparing equal
10954 to C strings identical up to that point. */
10955 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
10956 return 1; /* uni is longer */
10957 if (ustr[i])
10958 return -1; /* str is longer */
10959 return 0;
10960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010962 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010963 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010964 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010965 size_t len, len2 = strlen(str);
10966 int cmp;
10967
10968 len = Py_MIN(len1, len2);
10969 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010970 if (cmp != 0) {
10971 if (cmp < 0)
10972 return -1;
10973 else
10974 return 1;
10975 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010976 if (len1 > len2)
10977 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010978 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010979 return -1; /* str is longer */
10980 return 0;
10981 }
10982 else {
10983 void *data = PyUnicode_DATA(uni);
10984 /* Compare Unicode string and source character set string */
10985 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010986 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010987 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10988 /* This check keeps Python strings that end in '\0' from comparing equal
10989 to C strings identical up to that point. */
10990 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10991 return 1; /* uni is longer */
10992 if (str[i])
10993 return -1; /* str is longer */
10994 return 0;
10995 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010996}
10997
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020010998static int
10999non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11000{
11001 size_t i, len;
11002 const wchar_t *p;
11003 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11004 if (strlen(str) != len)
11005 return 0;
11006 p = _PyUnicode_WSTR(unicode);
11007 assert(p);
11008 for (i = 0; i < len; i++) {
11009 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011010 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011011 return 0;
11012 }
11013 return 1;
11014}
11015
11016int
11017_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11018{
11019 size_t len;
11020 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011021 assert(str);
11022#ifndef NDEBUG
11023 for (const char *p = str; *p; p++) {
11024 assert((unsigned char)*p < 128);
11025 }
11026#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011027 if (PyUnicode_READY(unicode) == -1) {
11028 /* Memory error or bad data */
11029 PyErr_Clear();
11030 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11031 }
11032 if (!PyUnicode_IS_ASCII(unicode))
11033 return 0;
11034 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11035 return strlen(str) == len &&
11036 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11037}
11038
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011039int
11040_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11041{
11042 PyObject *right_uni;
11043 Py_hash_t hash;
11044
11045 assert(_PyUnicode_CHECK(left));
11046 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011047#ifndef NDEBUG
11048 for (const char *p = right->string; *p; p++) {
11049 assert((unsigned char)*p < 128);
11050 }
11051#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011052
11053 if (PyUnicode_READY(left) == -1) {
11054 /* memory error or bad data */
11055 PyErr_Clear();
11056 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11057 }
11058
11059 if (!PyUnicode_IS_ASCII(left))
11060 return 0;
11061
11062 right_uni = _PyUnicode_FromId(right); /* borrowed */
11063 if (right_uni == NULL) {
11064 /* memory error or bad data */
11065 PyErr_Clear();
11066 return _PyUnicode_EqualToASCIIString(left, right->string);
11067 }
11068
11069 if (left == right_uni)
11070 return 1;
11071
11072 if (PyUnicode_CHECK_INTERNED(left))
11073 return 0;
11074
INADA Naoki7cc95f52018-01-28 02:07:09 +090011075 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011076 hash = _PyUnicode_HASH(left);
11077 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11078 return 0;
11079
11080 return unicode_compare_eq(left, right_uni);
11081}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011082
Alexander Belopolsky40018472011-02-26 01:02:56 +000011083PyObject *
11084PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011085{
11086 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011087
Victor Stinnere5567ad2012-10-23 02:48:49 +020011088 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11089 Py_RETURN_NOTIMPLEMENTED;
11090
11091 if (PyUnicode_READY(left) == -1 ||
11092 PyUnicode_READY(right) == -1)
11093 return NULL;
11094
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011095 if (left == right) {
11096 switch (op) {
11097 case Py_EQ:
11098 case Py_LE:
11099 case Py_GE:
11100 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011101 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011102 case Py_NE:
11103 case Py_LT:
11104 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011105 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011106 default:
11107 PyErr_BadArgument();
11108 return NULL;
11109 }
11110 }
11111 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011112 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011113 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011114 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011115 }
11116 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011117 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011118 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011119 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011120}
11121
Alexander Belopolsky40018472011-02-26 01:02:56 +000011122int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011123_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11124{
11125 return unicode_eq(aa, bb);
11126}
11127
11128int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011129PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011130{
Victor Stinner77282cb2013-04-14 19:22:47 +020011131 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 void *buf1, *buf2;
11133 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011134 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011135
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011136 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011137 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011138 "'in <string>' requires string as left operand, not %.100s",
11139 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011140 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011141 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011142 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011143 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011144 if (ensure_unicode(str) < 0)
11145 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011148 kind2 = PyUnicode_KIND(substr);
11149 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011150 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011152 len2 = PyUnicode_GET_LENGTH(substr);
11153 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011154 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011155 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011156 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011157 if (len2 == 1) {
11158 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11159 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011160 return result;
11161 }
11162 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011163 buf2 = _PyUnicode_AsKind(substr, kind1);
11164 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011165 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011167
Victor Stinner77282cb2013-04-14 19:22:47 +020011168 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 case PyUnicode_1BYTE_KIND:
11170 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11171 break;
11172 case PyUnicode_2BYTE_KIND:
11173 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11174 break;
11175 case PyUnicode_4BYTE_KIND:
11176 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11177 break;
11178 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011179 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011181
Victor Stinner77282cb2013-04-14 19:22:47 +020011182 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 PyMem_Free(buf2);
11184
Guido van Rossum403d68b2000-03-13 15:55:09 +000011185 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011186}
11187
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188/* Concat to string or Unicode object giving a new Unicode object. */
11189
Alexander Belopolsky40018472011-02-26 01:02:56 +000011190PyObject *
11191PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011193 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011194 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011195 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011197 if (ensure_unicode(left) < 0)
11198 return NULL;
11199
11200 if (!PyUnicode_Check(right)) {
11201 PyErr_Format(PyExc_TypeError,
11202 "can only concatenate str (not \"%.200s\") to str",
11203 right->ob_type->tp_name);
11204 return NULL;
11205 }
11206 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208
11209 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011210 if (left == unicode_empty)
11211 return PyUnicode_FromObject(right);
11212 if (right == unicode_empty)
11213 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011215 left_len = PyUnicode_GET_LENGTH(left);
11216 right_len = PyUnicode_GET_LENGTH(right);
11217 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011218 PyErr_SetString(PyExc_OverflowError,
11219 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011220 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011221 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011222 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011223
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011224 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11225 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011226 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011229 result = PyUnicode_New(new_len, maxchar);
11230 if (result == NULL)
11231 return NULL;
11232 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11233 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11234 assert(_PyUnicode_CheckConsistency(result, 1));
11235 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236}
11237
Walter Dörwald1ab83302007-05-18 17:15:44 +000011238void
Victor Stinner23e56682011-10-03 03:54:37 +020011239PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011240{
Victor Stinner23e56682011-10-03 03:54:37 +020011241 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011242 Py_UCS4 maxchar, maxchar2;
11243 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011244
11245 if (p_left == NULL) {
11246 if (!PyErr_Occurred())
11247 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011248 return;
11249 }
Victor Stinner23e56682011-10-03 03:54:37 +020011250 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011251 if (right == NULL || left == NULL
11252 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011253 if (!PyErr_Occurred())
11254 PyErr_BadInternalCall();
11255 goto error;
11256 }
11257
Benjamin Petersonbac79492012-01-14 13:34:47 -050011258 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011259 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011260 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011261 goto error;
11262
Victor Stinner488fa492011-12-12 00:01:39 +010011263 /* Shortcuts */
11264 if (left == unicode_empty) {
11265 Py_DECREF(left);
11266 Py_INCREF(right);
11267 *p_left = right;
11268 return;
11269 }
11270 if (right == unicode_empty)
11271 return;
11272
11273 left_len = PyUnicode_GET_LENGTH(left);
11274 right_len = PyUnicode_GET_LENGTH(right);
11275 if (left_len > PY_SSIZE_T_MAX - right_len) {
11276 PyErr_SetString(PyExc_OverflowError,
11277 "strings are too large to concat");
11278 goto error;
11279 }
11280 new_len = left_len + right_len;
11281
11282 if (unicode_modifiable(left)
11283 && PyUnicode_CheckExact(right)
11284 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011285 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11286 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011287 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011288 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011289 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11290 {
11291 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011292 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011293 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011294
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011295 /* copy 'right' into the newly allocated area of 'left' */
11296 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011297 }
Victor Stinner488fa492011-12-12 00:01:39 +010011298 else {
11299 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11300 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011301 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011302
Victor Stinner488fa492011-12-12 00:01:39 +010011303 /* Concat the two Unicode strings */
11304 res = PyUnicode_New(new_len, maxchar);
11305 if (res == NULL)
11306 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011307 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11308 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011309 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011310 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011311 }
11312 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011313 return;
11314
11315error:
Victor Stinner488fa492011-12-12 00:01:39 +010011316 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011317}
11318
11319void
11320PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11321{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011322 PyUnicode_Append(pleft, right);
11323 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011324}
11325
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011326/*
11327Wraps stringlib_parse_args_finds() and additionally ensures that the
11328first argument is a unicode object.
11329*/
11330
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011331static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011332parse_args_finds_unicode(const char * function_name, PyObject *args,
11333 PyObject **substring,
11334 Py_ssize_t *start, Py_ssize_t *end)
11335{
11336 if(stringlib_parse_args_finds(function_name, args, substring,
11337 start, end)) {
11338 if (ensure_unicode(*substring) < 0)
11339 return 0;
11340 return 1;
11341 }
11342 return 0;
11343}
11344
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011345PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011346 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011348Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011349string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011350interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351
11352static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011353unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011355 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011356 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011357 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011359 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 void *buf1, *buf2;
11361 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011363 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011364 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011366 kind1 = PyUnicode_KIND(self);
11367 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011368 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011369 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 len1 = PyUnicode_GET_LENGTH(self);
11372 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011374 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011375 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011376
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011377 buf1 = PyUnicode_DATA(self);
11378 buf2 = PyUnicode_DATA(substring);
11379 if (kind2 != kind1) {
11380 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011381 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011382 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011383 }
11384 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011385 case PyUnicode_1BYTE_KIND:
11386 iresult = ucs1lib_count(
11387 ((Py_UCS1*)buf1) + start, end - start,
11388 buf2, len2, PY_SSIZE_T_MAX
11389 );
11390 break;
11391 case PyUnicode_2BYTE_KIND:
11392 iresult = ucs2lib_count(
11393 ((Py_UCS2*)buf1) + start, end - start,
11394 buf2, len2, PY_SSIZE_T_MAX
11395 );
11396 break;
11397 case PyUnicode_4BYTE_KIND:
11398 iresult = ucs4lib_count(
11399 ((Py_UCS4*)buf1) + start, end - start,
11400 buf2, len2, PY_SSIZE_T_MAX
11401 );
11402 break;
11403 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011404 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405 }
11406
11407 result = PyLong_FromSsize_t(iresult);
11408
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011409 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412 return result;
11413}
11414
INADA Naoki3ae20562017-01-16 20:41:20 +090011415/*[clinic input]
11416str.encode as unicode_encode
11417
11418 encoding: str(c_default="NULL") = 'utf-8'
11419 The encoding in which to encode the string.
11420 errors: str(c_default="NULL") = 'strict'
11421 The error handling scheme to use for encoding errors.
11422 The default is 'strict' meaning that encoding errors raise a
11423 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11424 'xmlcharrefreplace' as well as any other name registered with
11425 codecs.register_error that can handle UnicodeEncodeErrors.
11426
11427Encode the string using the codec registered for encoding.
11428[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429
11430static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011431unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011432/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011434 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011435}
11436
INADA Naoki3ae20562017-01-16 20:41:20 +090011437/*[clinic input]
11438str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
INADA Naoki3ae20562017-01-16 20:41:20 +090011440 tabsize: int = 8
11441
11442Return a copy where all tab characters are expanded using spaces.
11443
11444If tabsize is not given, a tab size of 8 characters is assumed.
11445[clinic start generated code]*/
11446
11447static PyObject *
11448unicode_expandtabs_impl(PyObject *self, int tabsize)
11449/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011451 Py_ssize_t i, j, line_pos, src_len, incr;
11452 Py_UCS4 ch;
11453 PyObject *u;
11454 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011455 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011456 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457
Antoine Pitrou22425222011-10-04 19:10:51 +020011458 if (PyUnicode_READY(self) == -1)
11459 return NULL;
11460
Thomas Wouters7e474022000-07-16 12:04:32 +000011461 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011462 src_len = PyUnicode_GET_LENGTH(self);
11463 i = j = line_pos = 0;
11464 kind = PyUnicode_KIND(self);
11465 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011466 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011467 for (; i < src_len; i++) {
11468 ch = PyUnicode_READ(kind, src_data, i);
11469 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011470 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011472 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011474 goto overflow;
11475 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011476 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011477 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011481 goto overflow;
11482 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011484 if (ch == '\n' || ch == '\r')
11485 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011487 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011488 if (!found)
11489 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011490
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011492 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 if (!u)
11494 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011495 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496
Antoine Pitroue71d5742011-10-04 15:55:09 +020011497 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
Antoine Pitroue71d5742011-10-04 15:55:09 +020011499 for (; i < src_len; i++) {
11500 ch = PyUnicode_READ(kind, src_data, i);
11501 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011502 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011503 incr = tabsize - (line_pos % tabsize);
11504 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011505 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011506 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011508 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011510 line_pos++;
11511 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011512 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011513 if (ch == '\n' || ch == '\r')
11514 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011516 }
11517 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011518 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011519
Antoine Pitroue71d5742011-10-04 15:55:09 +020011520 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011521 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523}
11524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011525PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527\n\
11528Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011529such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530arguments start and end are interpreted as in slice notation.\n\
11531\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011532Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533
11534static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011537 /* initialize variables to prevent gcc warning */
11538 PyObject *substring = NULL;
11539 Py_ssize_t start = 0;
11540 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011541 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011543 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011546 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011549 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (result == -2)
11552 return NULL;
11553
Christian Heimes217cfd12007-12-02 14:31:20 +000011554 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555}
11556
11557static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011558unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011560 void *data;
11561 enum PyUnicode_Kind kind;
11562 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011563
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011564 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011565 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011567 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011568 if (PyUnicode_READY(self) == -1) {
11569 return NULL;
11570 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011571 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11572 PyErr_SetString(PyExc_IndexError, "string index out of range");
11573 return NULL;
11574 }
11575 kind = PyUnicode_KIND(self);
11576 data = PyUnicode_DATA(self);
11577 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011578 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579}
11580
Guido van Rossumc2504932007-09-18 19:42:40 +000011581/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011582 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011583static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011584unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011586 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011587
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011588#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011589 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011590#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 if (_PyUnicode_HASH(self) != -1)
11592 return _PyUnicode_HASH(self);
11593 if (PyUnicode_READY(self) == -1)
11594 return -1;
animalizea1d14252019-01-02 20:16:06 +080011595
Christian Heimes985ecdc2013-11-20 11:46:18 +010011596 x = _Py_HashBytes(PyUnicode_DATA(self),
11597 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011599 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600}
11601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011602PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011603 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604\n\
oldkaa0735f2018-02-02 16:52:55 +080011605Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011606such that sub is contained within S[start:end]. Optional\n\
11607arguments start and end are interpreted as in slice notation.\n\
11608\n\
11609Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610
11611static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011614 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011615 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011616 PyObject *substring = NULL;
11617 Py_ssize_t start = 0;
11618 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011620 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011623 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011626 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 if (result == -2)
11629 return NULL;
11630
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 if (result < 0) {
11632 PyErr_SetString(PyExc_ValueError, "substring not found");
11633 return NULL;
11634 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011635
Christian Heimes217cfd12007-12-02 14:31:20 +000011636 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637}
11638
INADA Naoki3ae20562017-01-16 20:41:20 +090011639/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011640str.isascii as unicode_isascii
11641
11642Return True if all characters in the string are ASCII, False otherwise.
11643
11644ASCII characters have code points in the range U+0000-U+007F.
11645Empty string is ASCII too.
11646[clinic start generated code]*/
11647
11648static PyObject *
11649unicode_isascii_impl(PyObject *self)
11650/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11651{
11652 if (PyUnicode_READY(self) == -1) {
11653 return NULL;
11654 }
11655 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11656}
11657
11658/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011659str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660
INADA Naoki3ae20562017-01-16 20:41:20 +090011661Return True if the string is a lowercase string, False otherwise.
11662
11663A string is lowercase if all cased characters in the string are lowercase and
11664there is at least one cased character in the string.
11665[clinic start generated code]*/
11666
11667static PyObject *
11668unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011669/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 Py_ssize_t i, length;
11672 int kind;
11673 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674 int cased;
11675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 if (PyUnicode_READY(self) == -1)
11677 return NULL;
11678 length = PyUnicode_GET_LENGTH(self);
11679 kind = PyUnicode_KIND(self);
11680 data = PyUnicode_DATA(self);
11681
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 if (length == 1)
11684 return PyBool_FromLong(
11685 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011687 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011689 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011690
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 for (i = 0; i < length; i++) {
11693 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011694
Benjamin Peterson29060642009-01-31 22:14:21 +000011695 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011696 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 else if (!cased && Py_UNICODE_ISLOWER(ch))
11698 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011700 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701}
11702
INADA Naoki3ae20562017-01-16 20:41:20 +090011703/*[clinic input]
11704str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705
INADA Naoki3ae20562017-01-16 20:41:20 +090011706Return True if the string is an uppercase string, False otherwise.
11707
11708A string is uppercase if all cased characters in the string are uppercase and
11709there is at least one cased character in the string.
11710[clinic start generated code]*/
11711
11712static PyObject *
11713unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011714/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 Py_ssize_t i, length;
11717 int kind;
11718 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719 int cased;
11720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 if (PyUnicode_READY(self) == -1)
11722 return NULL;
11723 length = PyUnicode_GET_LENGTH(self);
11724 kind = PyUnicode_KIND(self);
11725 data = PyUnicode_DATA(self);
11726
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 if (length == 1)
11729 return PyBool_FromLong(
11730 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011732 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011734 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011735
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 for (i = 0; i < length; i++) {
11738 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011739
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011741 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 else if (!cased && Py_UNICODE_ISUPPER(ch))
11743 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011745 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746}
11747
INADA Naoki3ae20562017-01-16 20:41:20 +090011748/*[clinic input]
11749str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750
INADA Naoki3ae20562017-01-16 20:41:20 +090011751Return True if the string is a title-cased string, False otherwise.
11752
11753In a title-cased string, upper- and title-case characters may only
11754follow uncased characters and lowercase characters only cased ones.
11755[clinic start generated code]*/
11756
11757static PyObject *
11758unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011759/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 Py_ssize_t i, length;
11762 int kind;
11763 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764 int cased, previous_is_cased;
11765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 if (PyUnicode_READY(self) == -1)
11767 return NULL;
11768 length = PyUnicode_GET_LENGTH(self);
11769 kind = PyUnicode_KIND(self);
11770 data = PyUnicode_DATA(self);
11771
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 if (length == 1) {
11774 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11775 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11776 (Py_UNICODE_ISUPPER(ch) != 0));
11777 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011779 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011781 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011782
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783 cased = 0;
11784 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 for (i = 0; i < length; i++) {
11786 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011787
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11789 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011790 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 previous_is_cased = 1;
11792 cased = 1;
11793 }
11794 else if (Py_UNICODE_ISLOWER(ch)) {
11795 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011796 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011797 previous_is_cased = 1;
11798 cased = 1;
11799 }
11800 else
11801 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011803 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804}
11805
INADA Naoki3ae20562017-01-16 20:41:20 +090011806/*[clinic input]
11807str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
INADA Naoki3ae20562017-01-16 20:41:20 +090011809Return True if the string is a whitespace string, False otherwise.
11810
11811A string is whitespace if all characters in the string are whitespace and there
11812is at least one character in the string.
11813[clinic start generated code]*/
11814
11815static PyObject *
11816unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011817/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 Py_ssize_t i, length;
11820 int kind;
11821 void *data;
11822
11823 if (PyUnicode_READY(self) == -1)
11824 return NULL;
11825 length = PyUnicode_GET_LENGTH(self);
11826 kind = PyUnicode_KIND(self);
11827 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 if (length == 1)
11831 return PyBool_FromLong(
11832 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011834 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011836 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 for (i = 0; i < length; i++) {
11839 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011840 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011841 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011843 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844}
11845
INADA Naoki3ae20562017-01-16 20:41:20 +090011846/*[clinic input]
11847str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011848
INADA Naoki3ae20562017-01-16 20:41:20 +090011849Return True if the string is an alphabetic string, False otherwise.
11850
11851A string is alphabetic if all characters in the string are alphabetic and there
11852is at least one character in the string.
11853[clinic start generated code]*/
11854
11855static PyObject *
11856unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011857/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011858{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 Py_ssize_t i, length;
11860 int kind;
11861 void *data;
11862
11863 if (PyUnicode_READY(self) == -1)
11864 return NULL;
11865 length = PyUnicode_GET_LENGTH(self);
11866 kind = PyUnicode_KIND(self);
11867 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011868
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011869 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011870 if (length == 1)
11871 return PyBool_FromLong(
11872 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011873
11874 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011876 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 for (i = 0; i < length; i++) {
11879 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011880 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011881 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011882 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011883}
11884
INADA Naoki3ae20562017-01-16 20:41:20 +090011885/*[clinic input]
11886str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011887
INADA Naoki3ae20562017-01-16 20:41:20 +090011888Return True if the string is an alpha-numeric string, False otherwise.
11889
11890A string is alpha-numeric if all characters in the string are alpha-numeric and
11891there is at least one character in the string.
11892[clinic start generated code]*/
11893
11894static PyObject *
11895unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011896/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011897{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 int kind;
11899 void *data;
11900 Py_ssize_t len, i;
11901
11902 if (PyUnicode_READY(self) == -1)
11903 return NULL;
11904
11905 kind = PyUnicode_KIND(self);
11906 data = PyUnicode_DATA(self);
11907 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011908
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011909 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 if (len == 1) {
11911 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11912 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11913 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011914
11915 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011917 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 for (i = 0; i < len; i++) {
11920 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011921 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011922 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011923 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011924 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011925}
11926
INADA Naoki3ae20562017-01-16 20:41:20 +090011927/*[clinic input]
11928str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929
INADA Naoki3ae20562017-01-16 20:41:20 +090011930Return True if the string is a decimal string, False otherwise.
11931
11932A string is a decimal string if all characters in the string are decimal and
11933there is at least one character in the string.
11934[clinic start generated code]*/
11935
11936static PyObject *
11937unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011938/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 Py_ssize_t i, length;
11941 int kind;
11942 void *data;
11943
11944 if (PyUnicode_READY(self) == -1)
11945 return NULL;
11946 length = PyUnicode_GET_LENGTH(self);
11947 kind = PyUnicode_KIND(self);
11948 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 if (length == 1)
11952 return PyBool_FromLong(
11953 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011955 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011957 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 for (i = 0; i < length; i++) {
11960 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011961 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011963 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964}
11965
INADA Naoki3ae20562017-01-16 20:41:20 +090011966/*[clinic input]
11967str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968
INADA Naoki3ae20562017-01-16 20:41:20 +090011969Return True if the string is a digit string, False otherwise.
11970
11971A string is a digit string if all characters in the string are digits and there
11972is at least one character in the string.
11973[clinic start generated code]*/
11974
11975static PyObject *
11976unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011977/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 Py_ssize_t i, length;
11980 int kind;
11981 void *data;
11982
11983 if (PyUnicode_READY(self) == -1)
11984 return NULL;
11985 length = PyUnicode_GET_LENGTH(self);
11986 kind = PyUnicode_KIND(self);
11987 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 if (length == 1) {
11991 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11992 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011995 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011997 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 for (i = 0; i < length; i++) {
12000 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012001 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012003 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004}
12005
INADA Naoki3ae20562017-01-16 20:41:20 +090012006/*[clinic input]
12007str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008
INADA Naoki3ae20562017-01-16 20:41:20 +090012009Return True if the string is a numeric string, False otherwise.
12010
12011A string is numeric if all characters in the string are numeric and there is at
12012least one character in the string.
12013[clinic start generated code]*/
12014
12015static PyObject *
12016unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012017/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 Py_ssize_t i, length;
12020 int kind;
12021 void *data;
12022
12023 if (PyUnicode_READY(self) == -1)
12024 return NULL;
12025 length = PyUnicode_GET_LENGTH(self);
12026 kind = PyUnicode_KIND(self);
12027 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 if (length == 1)
12031 return PyBool_FromLong(
12032 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012034 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012036 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 for (i = 0; i < length; i++) {
12039 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012040 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012042 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043}
12044
Martin v. Löwis47383402007-08-15 07:32:56 +000012045int
12046PyUnicode_IsIdentifier(PyObject *self)
12047{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 int kind;
12049 void *data;
12050 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012051 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 if (PyUnicode_READY(self) == -1) {
12054 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012055 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 }
12057
12058 /* Special case for empty strings */
12059 if (PyUnicode_GET_LENGTH(self) == 0)
12060 return 0;
12061 kind = PyUnicode_KIND(self);
12062 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012063
12064 /* PEP 3131 says that the first character must be in
12065 XID_Start and subsequent characters in XID_Continue,
12066 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012067 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012068 letters, digits, underscore). However, given the current
12069 definition of XID_Start and XID_Continue, it is sufficient
12070 to check just for these, except that _ must be allowed
12071 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012073 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012074 return 0;
12075
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012076 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012078 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012079 return 1;
12080}
12081
INADA Naoki3ae20562017-01-16 20:41:20 +090012082/*[clinic input]
12083str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012084
INADA Naoki3ae20562017-01-16 20:41:20 +090012085Return True if the string is a valid Python identifier, False otherwise.
12086
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012087Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012088such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012089[clinic start generated code]*/
12090
12091static PyObject *
12092unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012093/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012094{
12095 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12096}
12097
INADA Naoki3ae20562017-01-16 20:41:20 +090012098/*[clinic input]
12099str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012100
INADA Naoki3ae20562017-01-16 20:41:20 +090012101Return True if the string is printable, False otherwise.
12102
12103A string is printable if all of its characters are considered printable in
12104repr() or if it is empty.
12105[clinic start generated code]*/
12106
12107static PyObject *
12108unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012109/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 Py_ssize_t i, length;
12112 int kind;
12113 void *data;
12114
12115 if (PyUnicode_READY(self) == -1)
12116 return NULL;
12117 length = PyUnicode_GET_LENGTH(self);
12118 kind = PyUnicode_KIND(self);
12119 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012120
12121 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 if (length == 1)
12123 return PyBool_FromLong(
12124 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 for (i = 0; i < length; i++) {
12127 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012128 Py_RETURN_FALSE;
12129 }
12130 }
12131 Py_RETURN_TRUE;
12132}
12133
INADA Naoki3ae20562017-01-16 20:41:20 +090012134/*[clinic input]
12135str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136
INADA Naoki3ae20562017-01-16 20:41:20 +090012137 iterable: object
12138 /
12139
12140Concatenate any number of strings.
12141
Martin Panter91a88662017-01-24 00:30:06 +000012142The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012143The result is returned as a new string.
12144
12145Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12146[clinic start generated code]*/
12147
12148static PyObject *
12149unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012150/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151{
INADA Naoki3ae20562017-01-16 20:41:20 +090012152 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153}
12154
Martin v. Löwis18e16552006-02-15 17:27:45 +000012155static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012156unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 if (PyUnicode_READY(self) == -1)
12159 return -1;
12160 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161}
12162
INADA Naoki3ae20562017-01-16 20:41:20 +090012163/*[clinic input]
12164str.ljust as unicode_ljust
12165
12166 width: Py_ssize_t
12167 fillchar: Py_UCS4 = ' '
12168 /
12169
12170Return a left-justified string of length width.
12171
12172Padding is done using the specified fill character (default is a space).
12173[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174
12175static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012176unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12177/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012179 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012180 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181
Victor Stinnerc4b49542011-12-11 22:44:26 +010012182 if (PyUnicode_GET_LENGTH(self) >= width)
12183 return unicode_result_unchanged(self);
12184
12185 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186}
12187
INADA Naoki3ae20562017-01-16 20:41:20 +090012188/*[clinic input]
12189str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190
INADA Naoki3ae20562017-01-16 20:41:20 +090012191Return a copy of the string converted to lowercase.
12192[clinic start generated code]*/
12193
12194static PyObject *
12195unicode_lower_impl(PyObject *self)
12196/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012198 if (PyUnicode_READY(self) == -1)
12199 return NULL;
12200 if (PyUnicode_IS_ASCII(self))
12201 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012202 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203}
12204
/* Strip modes.  Each value is also the index of the matching entry in
   stripfuncnames[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, in LEFTSTRIP / RIGHTSTRIP / BOTHSTRIP order. */
static const char *stripfuncnames[] = {
    "lstrip",
    "rstrip",
    "strip"
};

/* Look up the method name for strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012213
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012214/* externally visible for str.strip(unicode) */
12215PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012216_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012217{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 void *data;
12219 int kind;
12220 Py_ssize_t i, j, len;
12221 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012222 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12225 return NULL;
12226
12227 kind = PyUnicode_KIND(self);
12228 data = PyUnicode_DATA(self);
12229 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012230 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12232 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012233 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012234
Benjamin Peterson14339b62009-01-31 16:36:08 +000012235 i = 0;
12236 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012237 while (i < len) {
12238 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12239 if (!BLOOM(sepmask, ch))
12240 break;
12241 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12242 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012243 i++;
12244 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012245 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012246
Benjamin Peterson14339b62009-01-31 16:36:08 +000012247 j = len;
12248 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012249 j--;
12250 while (j >= i) {
12251 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12252 if (!BLOOM(sepmask, ch))
12253 break;
12254 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12255 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012256 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012257 }
12258
Benjamin Peterson29060642009-01-31 22:14:21 +000012259 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012260 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012261
Victor Stinner7931d9a2011-11-04 00:22:48 +010012262 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263}
12264
12265PyObject*
12266PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12267{
12268 unsigned char *data;
12269 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012270 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271
Victor Stinnerde636f32011-10-01 03:55:54 +020012272 if (PyUnicode_READY(self) == -1)
12273 return NULL;
12274
Victor Stinner684d5fd2012-05-03 02:32:34 +020012275 length = PyUnicode_GET_LENGTH(self);
12276 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012277
Victor Stinner684d5fd2012-05-03 02:32:34 +020012278 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012279 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280
Victor Stinnerde636f32011-10-01 03:55:54 +020012281 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012282 PyErr_SetString(PyExc_IndexError, "string index out of range");
12283 return NULL;
12284 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012285 if (start >= length || end < start)
12286 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012287
Victor Stinner684d5fd2012-05-03 02:32:34 +020012288 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012289 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012290 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012291 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012292 }
12293 else {
12294 kind = PyUnicode_KIND(self);
12295 data = PyUnicode_1BYTE_DATA(self);
12296 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012297 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012298 length);
12299 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301
12302static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012303do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 Py_ssize_t len, i, j;
12306
12307 if (PyUnicode_READY(self) == -1)
12308 return NULL;
12309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012311
Victor Stinnercc7af722013-04-09 22:39:24 +020012312 if (PyUnicode_IS_ASCII(self)) {
12313 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12314
12315 i = 0;
12316 if (striptype != RIGHTSTRIP) {
12317 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012318 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012319 if (!_Py_ascii_whitespace[ch])
12320 break;
12321 i++;
12322 }
12323 }
12324
12325 j = len;
12326 if (striptype != LEFTSTRIP) {
12327 j--;
12328 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012329 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012330 if (!_Py_ascii_whitespace[ch])
12331 break;
12332 j--;
12333 }
12334 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012335 }
12336 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012337 else {
12338 int kind = PyUnicode_KIND(self);
12339 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012340
Victor Stinnercc7af722013-04-09 22:39:24 +020012341 i = 0;
12342 if (striptype != RIGHTSTRIP) {
12343 while (i < len) {
12344 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12345 if (!Py_UNICODE_ISSPACE(ch))
12346 break;
12347 i++;
12348 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012349 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012350
12351 j = len;
12352 if (striptype != LEFTSTRIP) {
12353 j--;
12354 while (j >= i) {
12355 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12356 if (!Py_UNICODE_ISSPACE(ch))
12357 break;
12358 j--;
12359 }
12360 j++;
12361 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012362 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012363
Victor Stinner7931d9a2011-11-04 00:22:48 +010012364 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365}
12366
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012367
12368static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012369do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012370{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012371 if (sep != NULL && sep != Py_None) {
12372 if (PyUnicode_Check(sep))
12373 return _PyUnicode_XStrip(self, striptype, sep);
12374 else {
12375 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012376 "%s arg must be None or str",
12377 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012378 return NULL;
12379 }
12380 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012381
Benjamin Peterson14339b62009-01-31 16:36:08 +000012382 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012383}
12384
12385
INADA Naoki3ae20562017-01-16 20:41:20 +090012386/*[clinic input]
12387str.strip as unicode_strip
12388
12389 chars: object = None
12390 /
12391
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012393
12394If chars is given and not None, remove characters in chars instead.
12395[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012396
12397static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012398unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012399/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012400{
INADA Naoki3ae20562017-01-16 20:41:20 +090012401 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012402}
12403
12404
INADA Naoki3ae20562017-01-16 20:41:20 +090012405/*[clinic input]
12406str.lstrip as unicode_lstrip
12407
12408 chars: object = NULL
12409 /
12410
12411Return a copy of the string with leading whitespace removed.
12412
12413If chars is given and not None, remove characters in chars instead.
12414[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012415
12416static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012417unicode_lstrip_impl(PyObject *self, PyObject *chars)
12418/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012419{
INADA Naoki3ae20562017-01-16 20:41:20 +090012420 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012421}
12422
12423
INADA Naoki3ae20562017-01-16 20:41:20 +090012424/*[clinic input]
12425str.rstrip as unicode_rstrip
12426
12427 chars: object = NULL
12428 /
12429
12430Return a copy of the string with trailing whitespace removed.
12431
12432If chars is given and not None, remove characters in chars instead.
12433[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012434
12435static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012436unicode_rstrip_impl(PyObject *self, PyObject *chars)
12437/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012438{
INADA Naoki3ae20562017-01-16 20:41:20 +090012439 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012440}
12441
12442
Guido van Rossumd57fd912000-03-10 22:53:23 +000012443static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012444unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012446 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012448
Serhiy Storchaka05997252013-01-26 12:14:02 +020012449 if (len < 1)
12450 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451
Victor Stinnerc4b49542011-12-11 22:44:26 +010012452 /* no repeat, return original string */
12453 if (len == 1)
12454 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012455
Benjamin Petersonbac79492012-01-14 13:34:47 -050012456 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 return NULL;
12458
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012459 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012460 PyErr_SetString(PyExc_OverflowError,
12461 "repeated string is too long");
12462 return NULL;
12463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012465
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012466 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467 if (!u)
12468 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012469 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 if (PyUnicode_GET_LENGTH(str) == 1) {
12472 const int kind = PyUnicode_KIND(str);
12473 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012474 if (kind == PyUnicode_1BYTE_KIND) {
12475 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012476 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012477 }
12478 else if (kind == PyUnicode_2BYTE_KIND) {
12479 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012480 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012481 ucs2[n] = fill_char;
12482 } else {
12483 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12484 assert(kind == PyUnicode_4BYTE_KIND);
12485 for (n = 0; n < len; ++n)
12486 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 }
12489 else {
12490 /* number of characters copied this far */
12491 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012492 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012494 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012496 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012497 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012498 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012499 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012500 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501 }
12502
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012503 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012504 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505}
12506
Alexander Belopolsky40018472011-02-26 01:02:56 +000012507PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012508PyUnicode_Replace(PyObject *str,
12509 PyObject *substr,
12510 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012511 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012513 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12514 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012515 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012516 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517}
12518
INADA Naoki3ae20562017-01-16 20:41:20 +090012519/*[clinic input]
12520str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
INADA Naoki3ae20562017-01-16 20:41:20 +090012522 old: unicode
12523 new: unicode
12524 count: Py_ssize_t = -1
12525 Maximum number of occurrences to replace.
12526 -1 (the default value) means replace all occurrences.
12527 /
12528
12529Return a copy with all occurrences of substring old replaced by new.
12530
12531If the optional argument count is given, only the first count occurrences are
12532replaced.
12533[clinic start generated code]*/
12534
12535static PyObject *
12536unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12537 Py_ssize_t count)
12538/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012540 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012541 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012542 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543}
12544
Alexander Belopolsky40018472011-02-26 01:02:56 +000012545static PyObject *
12546unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012548 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 Py_ssize_t isize;
12550 Py_ssize_t osize, squote, dquote, i, o;
12551 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012552 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012556 return NULL;
12557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558 isize = PyUnicode_GET_LENGTH(unicode);
12559 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 /* Compute length of output, quote characters, and
12562 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012563 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 max = 127;
12565 squote = dquote = 0;
12566 ikind = PyUnicode_KIND(unicode);
12567 for (i = 0; i < isize; i++) {
12568 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012569 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012571 case '\'': squote++; break;
12572 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012574 incr = 2;
12575 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 default:
12577 /* Fast-path ASCII */
12578 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012579 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012581 ;
12582 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012585 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012587 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012589 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012591 if (osize > PY_SSIZE_T_MAX - incr) {
12592 PyErr_SetString(PyExc_OverflowError,
12593 "string is too long to generate repr");
12594 return NULL;
12595 }
12596 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012597 }
12598
12599 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012600 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012602 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 if (dquote)
12604 /* Both squote and dquote present. Use squote,
12605 and escape them */
12606 osize += squote;
12607 else
12608 quote = '"';
12609 }
Victor Stinner55c08782013-04-14 18:45:39 +020012610 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611
12612 repr = PyUnicode_New(osize, max);
12613 if (repr == NULL)
12614 return NULL;
12615 okind = PyUnicode_KIND(repr);
12616 odata = PyUnicode_DATA(repr);
12617
12618 PyUnicode_WRITE(okind, odata, 0, quote);
12619 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012620 if (unchanged) {
12621 _PyUnicode_FastCopyCharacters(repr, 1,
12622 unicode, 0,
12623 isize);
12624 }
12625 else {
12626 for (i = 0, o = 1; i < isize; i++) {
12627 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628
Victor Stinner55c08782013-04-14 18:45:39 +020012629 /* Escape quotes and backslashes */
12630 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012631 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012633 continue;
12634 }
12635
12636 /* Map special whitespace to '\t', \n', '\r' */
12637 if (ch == '\t') {
12638 PyUnicode_WRITE(okind, odata, o++, '\\');
12639 PyUnicode_WRITE(okind, odata, o++, 't');
12640 }
12641 else if (ch == '\n') {
12642 PyUnicode_WRITE(okind, odata, o++, '\\');
12643 PyUnicode_WRITE(okind, odata, o++, 'n');
12644 }
12645 else if (ch == '\r') {
12646 PyUnicode_WRITE(okind, odata, o++, '\\');
12647 PyUnicode_WRITE(okind, odata, o++, 'r');
12648 }
12649
12650 /* Map non-printable US ASCII to '\xhh' */
12651 else if (ch < ' ' || ch == 0x7F) {
12652 PyUnicode_WRITE(okind, odata, o++, '\\');
12653 PyUnicode_WRITE(okind, odata, o++, 'x');
12654 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12655 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12656 }
12657
12658 /* Copy ASCII characters as-is */
12659 else if (ch < 0x7F) {
12660 PyUnicode_WRITE(okind, odata, o++, ch);
12661 }
12662
12663 /* Non-ASCII characters */
12664 else {
12665 /* Map Unicode whitespace and control characters
12666 (categories Z* and C* except ASCII space)
12667 */
12668 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12669 PyUnicode_WRITE(okind, odata, o++, '\\');
12670 /* Map 8-bit characters to '\xhh' */
12671 if (ch <= 0xff) {
12672 PyUnicode_WRITE(okind, odata, o++, 'x');
12673 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12674 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12675 }
12676 /* Map 16-bit characters to '\uxxxx' */
12677 else if (ch <= 0xffff) {
12678 PyUnicode_WRITE(okind, odata, o++, 'u');
12679 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12680 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12681 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12682 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12683 }
12684 /* Map 21-bit characters to '\U00xxxxxx' */
12685 else {
12686 PyUnicode_WRITE(okind, odata, o++, 'U');
12687 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12688 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12689 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12690 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12691 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12692 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12693 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12694 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12695 }
12696 }
12697 /* Copy characters as-is */
12698 else {
12699 PyUnicode_WRITE(okind, odata, o++, ch);
12700 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012701 }
12702 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012705 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012706 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012707}
12708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012709PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012710 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711\n\
12712Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012713such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714arguments start and end are interpreted as in slice notation.\n\
12715\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012716Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717
12718static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012721 /* initialize variables to prevent gcc warning */
12722 PyObject *substring = NULL;
12723 Py_ssize_t start = 0;
12724 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012725 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012727 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012728 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012730 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012731 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012733 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 if (result == -2)
12736 return NULL;
12737
Christian Heimes217cfd12007-12-02 14:31:20 +000012738 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739}
12740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012741PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012744Return the highest index in S where substring sub is found,\n\
12745such that sub is contained within S[start:end]. Optional\n\
12746arguments start and end are interpreted as in slice notation.\n\
12747\n\
12748Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749
12750static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012752{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012753 /* initialize variables to prevent gcc warning */
12754 PyObject *substring = NULL;
12755 Py_ssize_t start = 0;
12756 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012757 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012758
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012759 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012760 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012762 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012765 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767 if (result == -2)
12768 return NULL;
12769
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770 if (result < 0) {
12771 PyErr_SetString(PyExc_ValueError, "substring not found");
12772 return NULL;
12773 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012774
Christian Heimes217cfd12007-12-02 14:31:20 +000012775 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776}
12777
INADA Naoki3ae20562017-01-16 20:41:20 +090012778/*[clinic input]
12779str.rjust as unicode_rjust
12780
12781 width: Py_ssize_t
12782 fillchar: Py_UCS4 = ' '
12783 /
12784
12785Return a right-justified string of length width.
12786
12787Padding is done using the specified fill character (default is a space).
12788[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789
12790static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012791unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12792/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012794 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012795 return NULL;
12796
Victor Stinnerc4b49542011-12-11 22:44:26 +010012797 if (PyUnicode_GET_LENGTH(self) >= width)
12798 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799
Victor Stinnerc4b49542011-12-11 22:44:26 +010012800 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801}
12802
Alexander Belopolsky40018472011-02-26 01:02:56 +000012803PyObject *
12804PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012806 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012807 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012809 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810}
12811
INADA Naoki3ae20562017-01-16 20:41:20 +090012812/*[clinic input]
12813str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814
INADA Naoki3ae20562017-01-16 20:41:20 +090012815 sep: object = None
        The delimiter according to which to split the string.
12817 None (the default value) means split according to any whitespace,
12818 and discard empty strings from the result.
12819 maxsplit: Py_ssize_t = -1
12820 Maximum number of splits to do.
12821 -1 (the default value) means no limit.
12822
12823Return a list of the words in the string, using sep as the delimiter string.
12824[clinic start generated code]*/
12825
12826static PyObject *
12827unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12828/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829{
INADA Naoki3ae20562017-01-16 20:41:20 +090012830 if (sep == Py_None)
12831 return split(self, NULL, maxsplit);
12832 if (PyUnicode_Check(sep))
12833 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012834
Victor Stinner998b8062018-09-12 00:23:25 +020012835 PyErr_Format(PyExc_TypeError,
12836 "must be str or None, not %.100s",
12837 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839}
12840
Thomas Wouters477c8d52006-05-27 19:21:47 +000012841PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012842PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012843{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012844 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012845 int kind1, kind2;
12846 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012848
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012849 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012850 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012851
Victor Stinner14f8f022011-10-05 20:58:25 +020012852 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012853 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854 len1 = PyUnicode_GET_LENGTH(str_obj);
12855 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012856 if (kind1 < kind2 || len1 < len2) {
12857 _Py_INCREF_UNICODE_EMPTY();
12858 if (!unicode_empty)
12859 out = NULL;
12860 else {
12861 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12862 Py_DECREF(unicode_empty);
12863 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012864 return out;
12865 }
12866 buf1 = PyUnicode_DATA(str_obj);
12867 buf2 = PyUnicode_DATA(sep_obj);
12868 if (kind2 != kind1) {
12869 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12870 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012871 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012872 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012874 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012876 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12877 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12878 else
12879 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012880 break;
12881 case PyUnicode_2BYTE_KIND:
12882 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12883 break;
12884 case PyUnicode_4BYTE_KIND:
12885 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12886 break;
12887 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012888 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012889 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012890
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012891 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012893
12894 return out;
12895}
12896
12897
12898PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012899PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012900{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012901 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012902 int kind1, kind2;
12903 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012905
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012906 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012907 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012908
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012909 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012910 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 len1 = PyUnicode_GET_LENGTH(str_obj);
12912 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012913 if (kind1 < kind2 || len1 < len2) {
12914 _Py_INCREF_UNICODE_EMPTY();
12915 if (!unicode_empty)
12916 out = NULL;
12917 else {
12918 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12919 Py_DECREF(unicode_empty);
12920 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012921 return out;
12922 }
12923 buf1 = PyUnicode_DATA(str_obj);
12924 buf2 = PyUnicode_DATA(sep_obj);
12925 if (kind2 != kind1) {
12926 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12927 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012928 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012931 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012933 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12934 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12935 else
12936 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937 break;
12938 case PyUnicode_2BYTE_KIND:
12939 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12940 break;
12941 case PyUnicode_4BYTE_KIND:
12942 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12943 break;
12944 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012945 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012947
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012948 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012950
12951 return out;
12952}
12953
INADA Naoki3ae20562017-01-16 20:41:20 +090012954/*[clinic input]
12955str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012956
INADA Naoki3ae20562017-01-16 20:41:20 +090012957 sep: object
12958 /
12959
12960Partition the string into three parts using the given separator.
12961
12962This will search for the separator in the string. If the separator is found,
12963returns a 3-tuple containing the part before the separator, the separator
12964itself, and the part after it.
12965
12966If the separator is not found, returns a 3-tuple containing the original string
12967and two empty strings.
12968[clinic start generated code]*/
12969
12970static PyObject *
12971unicode_partition(PyObject *self, PyObject *sep)
12972/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012973{
INADA Naoki3ae20562017-01-16 20:41:20 +090012974 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012975}
12976
INADA Naoki3ae20562017-01-16 20:41:20 +090012977/*[clinic input]
12978str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012979
INADA Naoki3ae20562017-01-16 20:41:20 +090012980Partition the string into three parts using the given separator.
12981
Serhiy Storchakaa2314282017-10-29 02:11:54 +030012982This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090012983the separator is found, returns a 3-tuple containing the part before the
12984separator, the separator itself, and the part after it.
12985
12986If the separator is not found, returns a 3-tuple containing two empty strings
12987and the original string.
12988[clinic start generated code]*/
12989
12990static PyObject *
12991unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030012992/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012993{
INADA Naoki3ae20562017-01-16 20:41:20 +090012994 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012995}
12996
Alexander Belopolsky40018472011-02-26 01:02:56 +000012997PyObject *
12998PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012999{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013000 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013001 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013002
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013003 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013004}
13005
INADA Naoki3ae20562017-01-16 20:41:20 +090013006/*[clinic input]
13007str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013008
INADA Naoki3ae20562017-01-16 20:41:20 +090013009Return a list of the words in the string, using sep as the delimiter string.
13010
13011Splits are done starting at the end of the string and working to the front.
13012[clinic start generated code]*/
13013
13014static PyObject *
13015unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13016/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013017{
INADA Naoki3ae20562017-01-16 20:41:20 +090013018 if (sep == Py_None)
13019 return rsplit(self, NULL, maxsplit);
13020 if (PyUnicode_Check(sep))
13021 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013022
Victor Stinner998b8062018-09-12 00:23:25 +020013023 PyErr_Format(PyExc_TypeError,
13024 "must be str or None, not %.100s",
13025 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013026 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013027}
13028
INADA Naoki3ae20562017-01-16 20:41:20 +090013029/*[clinic input]
13030str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013031
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013032 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013033
13034Return a list of the lines in the string, breaking at line boundaries.
13035
13036Line breaks are not included in the resulting list unless keepends is given and
13037true.
13038[clinic start generated code]*/
13039
13040static PyObject *
13041unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013042/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013043{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013044 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013045}
13046
13047static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013048PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013049{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013050 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013051}
13052
INADA Naoki3ae20562017-01-16 20:41:20 +090013053/*[clinic input]
13054str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013055
INADA Naoki3ae20562017-01-16 20:41:20 +090013056Convert uppercase characters to lowercase and lowercase characters to uppercase.
13057[clinic start generated code]*/
13058
13059static PyObject *
13060unicode_swapcase_impl(PyObject *self)
13061/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013063 if (PyUnicode_READY(self) == -1)
13064 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013065 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066}
13067
Larry Hastings61272b72014-01-07 12:41:53 -080013068/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013069
Larry Hastings31826802013-10-19 00:09:25 -070013070@staticmethod
13071str.maketrans as unicode_maketrans
13072
13073 x: object
13074
13075 y: unicode=NULL
13076
13077 z: unicode=NULL
13078
13079 /
13080
13081Return a translation table usable for str.translate().
13082
13083If there is only one argument, it must be a dictionary mapping Unicode
13084ordinals (integers) or characters to Unicode ordinals, strings or None.
13085Character keys will be then converted to ordinals.
13086If there are two arguments, they must be strings of equal length, and
13087in the resulting dictionary, each character in x will be mapped to the
13088character at the same position in y. If there is a third argument, it
13089must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013090[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013091
Larry Hastings31826802013-10-19 00:09:25 -070013092static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013093unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013094/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013095{
Georg Brandlceee0772007-11-27 23:48:05 +000013096 PyObject *new = NULL, *key, *value;
13097 Py_ssize_t i = 0;
13098 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013099
Georg Brandlceee0772007-11-27 23:48:05 +000013100 new = PyDict_New();
13101 if (!new)
13102 return NULL;
13103 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013104 int x_kind, y_kind, z_kind;
13105 void *x_data, *y_data, *z_data;
13106
Georg Brandlceee0772007-11-27 23:48:05 +000013107 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013108 if (!PyUnicode_Check(x)) {
13109 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13110 "be a string if there is a second argument");
13111 goto err;
13112 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013114 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13115 "arguments must have equal length");
13116 goto err;
13117 }
13118 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013119 x_kind = PyUnicode_KIND(x);
13120 y_kind = PyUnicode_KIND(y);
13121 x_data = PyUnicode_DATA(x);
13122 y_data = PyUnicode_DATA(y);
13123 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13124 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013125 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013126 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013127 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013128 if (!value) {
13129 Py_DECREF(key);
13130 goto err;
13131 }
Georg Brandlceee0772007-11-27 23:48:05 +000013132 res = PyDict_SetItem(new, key, value);
13133 Py_DECREF(key);
13134 Py_DECREF(value);
13135 if (res < 0)
13136 goto err;
13137 }
13138 /* create entries for deleting chars in z */
13139 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 z_kind = PyUnicode_KIND(z);
13141 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013142 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013144 if (!key)
13145 goto err;
13146 res = PyDict_SetItem(new, key, Py_None);
13147 Py_DECREF(key);
13148 if (res < 0)
13149 goto err;
13150 }
13151 }
13152 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013153 int kind;
13154 void *data;
13155
Georg Brandlceee0772007-11-27 23:48:05 +000013156 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013157 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013158 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13159 "to maketrans it must be a dict");
13160 goto err;
13161 }
13162 /* copy entries into the new dict, converting string keys to int keys */
13163 while (PyDict_Next(x, &i, &key, &value)) {
13164 if (PyUnicode_Check(key)) {
13165 /* convert string keys to integer keys */
13166 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013167 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013168 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13169 "table must be of length 1");
13170 goto err;
13171 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013172 kind = PyUnicode_KIND(key);
13173 data = PyUnicode_DATA(key);
13174 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013175 if (!newkey)
13176 goto err;
13177 res = PyDict_SetItem(new, newkey, value);
13178 Py_DECREF(newkey);
13179 if (res < 0)
13180 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013181 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013182 /* just keep integer keys */
13183 if (PyDict_SetItem(new, key, value) < 0)
13184 goto err;
13185 } else {
13186 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13187 "be strings or integers");
13188 goto err;
13189 }
13190 }
13191 }
13192 return new;
13193 err:
13194 Py_DECREF(new);
13195 return NULL;
13196}
13197
INADA Naoki3ae20562017-01-16 20:41:20 +090013198/*[clinic input]
13199str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200
INADA Naoki3ae20562017-01-16 20:41:20 +090013201 table: object
13202 Translation table, which must be a mapping of Unicode ordinals to
13203 Unicode ordinals, strings, or None.
13204 /
13205
13206Replace each character in the string using the given translation table.
13207
13208The table must implement lookup/indexing via __getitem__, for instance a
13209dictionary or list. If this operation raises LookupError, the character is
13210left untouched. Characters mapped to None are deleted.
13211[clinic start generated code]*/
13212
13213static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013214unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013215/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013216{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013217 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013218}
13219
INADA Naoki3ae20562017-01-16 20:41:20 +090013220/*[clinic input]
13221str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013222
INADA Naoki3ae20562017-01-16 20:41:20 +090013223Return a copy of the string converted to uppercase.
13224[clinic start generated code]*/
13225
13226static PyObject *
13227unicode_upper_impl(PyObject *self)
13228/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013229{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013230 if (PyUnicode_READY(self) == -1)
13231 return NULL;
13232 if (PyUnicode_IS_ASCII(self))
13233 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013234 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235}
13236
INADA Naoki3ae20562017-01-16 20:41:20 +090013237/*[clinic input]
13238str.zfill as unicode_zfill
13239
13240 width: Py_ssize_t
13241 /
13242
13243Pad a numeric string with zeros on the left, to fill a field of the given width.
13244
13245The string is never truncated.
13246[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013247
13248static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013249unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013250/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013252 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013253 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013254 int kind;
13255 void *data;
13256 Py_UCS4 chr;
13257
Benjamin Petersonbac79492012-01-14 13:34:47 -050013258 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260
Victor Stinnerc4b49542011-12-11 22:44:26 +010013261 if (PyUnicode_GET_LENGTH(self) >= width)
13262 return unicode_result_unchanged(self);
13263
13264 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013265
13266 u = pad(self, fill, 0, '0');
13267
Walter Dörwald068325e2002-04-15 13:36:47 +000013268 if (u == NULL)
13269 return NULL;
13270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013271 kind = PyUnicode_KIND(u);
13272 data = PyUnicode_DATA(u);
13273 chr = PyUnicode_READ(kind, data, fill);
13274
13275 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013276 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277 PyUnicode_WRITE(kind, data, 0, chr);
13278 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013279 }
13280
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013281 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013282 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013283}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284
#if 0
/* Compiled out.  Would forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013293PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013294 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013296Return True if S starts with the specified prefix, False otherwise.\n\
13297With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013298With optional end, stop comparing S at that position.\n\
13299prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300
13301static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013302unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013303 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013305 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013306 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013307 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013308 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013309 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013310
Jesus Ceaac451502011-04-20 17:09:23 +020013311 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013312 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013313 if (PyTuple_Check(subobj)) {
13314 Py_ssize_t i;
13315 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013316 substring = PyTuple_GET_ITEM(subobj, i);
13317 if (!PyUnicode_Check(substring)) {
13318 PyErr_Format(PyExc_TypeError,
13319 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013320 "not %.100s",
13321 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013322 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013323 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013324 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013325 if (result == -1)
13326 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013327 if (result) {
13328 Py_RETURN_TRUE;
13329 }
13330 }
13331 /* nothing matched */
13332 Py_RETURN_FALSE;
13333 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013334 if (!PyUnicode_Check(subobj)) {
13335 PyErr_Format(PyExc_TypeError,
13336 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013337 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013338 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013339 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013340 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013341 if (result == -1)
13342 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013343 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013344}
13345
13346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013347PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013348 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013349\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013350Return True if S ends with the specified suffix, False otherwise.\n\
13351With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013352With optional end, stop comparing S at that position.\n\
13353suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013354
13355static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013356unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013357 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013358{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013359 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013360 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013361 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013362 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013363 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013364
Jesus Ceaac451502011-04-20 17:09:23 +020013365 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013366 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013367 if (PyTuple_Check(subobj)) {
13368 Py_ssize_t i;
13369 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013370 substring = PyTuple_GET_ITEM(subobj, i);
13371 if (!PyUnicode_Check(substring)) {
13372 PyErr_Format(PyExc_TypeError,
13373 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013374 "not %.100s",
13375 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013376 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013377 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013378 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013379 if (result == -1)
13380 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013381 if (result) {
13382 Py_RETURN_TRUE;
13383 }
13384 }
13385 Py_RETURN_FALSE;
13386 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013387 if (!PyUnicode_Check(subobj)) {
13388 PyErr_Format(PyExc_TypeError,
13389 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013390 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013392 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013393 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013394 if (result == -1)
13395 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013396 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013397}
13398
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013399static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013400_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013401{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013402 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13403 writer->data = PyUnicode_DATA(writer->buffer);
13404
13405 if (!writer->readonly) {
13406 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013407 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013408 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013409 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013410 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13411 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13412 writer->kind = PyUnicode_WCHAR_KIND;
13413 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13414
Victor Stinner8f674cc2013-04-17 23:02:17 +020013415 /* Copy-on-write mode: set buffer size to 0 so
13416 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13417 * next write. */
13418 writer->size = 0;
13419 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013420}
13421
Victor Stinnerd3f08822012-05-29 12:57:52 +020013422void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013423_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013424{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013425 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013426
13427 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013428 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013429
13430 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13431 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13432 writer->kind = PyUnicode_WCHAR_KIND;
13433 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013434}
13435
Victor Stinnerd3f08822012-05-29 12:57:52 +020013436int
13437_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13438 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013439{
13440 Py_ssize_t newlen;
13441 PyObject *newbuffer;
13442
Victor Stinner2740e462016-09-06 16:58:36 -070013443 assert(maxchar <= MAX_UNICODE);
13444
Victor Stinnerca9381e2015-09-22 00:58:32 +020013445 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013446 assert((maxchar > writer->maxchar && length >= 0)
13447 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013448
Victor Stinner202fdca2012-05-07 12:47:02 +020013449 if (length > PY_SSIZE_T_MAX - writer->pos) {
13450 PyErr_NoMemory();
13451 return -1;
13452 }
13453 newlen = writer->pos + length;
13454
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013455 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013456
Victor Stinnerd3f08822012-05-29 12:57:52 +020013457 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013458 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013459 if (writer->overallocate
13460 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13461 /* overallocate to limit the number of realloc() */
13462 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013463 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013464 if (newlen < writer->min_length)
13465 newlen = writer->min_length;
13466
Victor Stinnerd3f08822012-05-29 12:57:52 +020013467 writer->buffer = PyUnicode_New(newlen, maxchar);
13468 if (writer->buffer == NULL)
13469 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013470 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013471 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013472 if (writer->overallocate
13473 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13474 /* overallocate to limit the number of realloc() */
13475 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013476 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013477 if (newlen < writer->min_length)
13478 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013479
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013480 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013481 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013482 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013483 newbuffer = PyUnicode_New(newlen, maxchar);
13484 if (newbuffer == NULL)
13485 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013486 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13487 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013488 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013489 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013490 }
13491 else {
13492 newbuffer = resize_compact(writer->buffer, newlen);
13493 if (newbuffer == NULL)
13494 return -1;
13495 }
13496 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013497 }
13498 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013499 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013500 newbuffer = PyUnicode_New(writer->size, maxchar);
13501 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013502 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013503 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13504 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013505 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013506 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013507 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013508 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013509
13510#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013511}
13512
Victor Stinnerca9381e2015-09-22 00:58:32 +020013513int
13514_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13515 enum PyUnicode_Kind kind)
13516{
13517 Py_UCS4 maxchar;
13518
13519 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13520 assert(writer->kind < kind);
13521
13522 switch (kind)
13523 {
13524 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13525 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13526 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13527 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013528 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013529 }
13530
13531 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13532}
13533
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013534static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013535_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013536{
Victor Stinner2740e462016-09-06 16:58:36 -070013537 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013538 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13539 return -1;
13540 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13541 writer->pos++;
13542 return 0;
13543}
13544
13545int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013546_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13547{
13548 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13549}
13550
13551int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013552_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13553{
13554 Py_UCS4 maxchar;
13555 Py_ssize_t len;
13556
13557 if (PyUnicode_READY(str) == -1)
13558 return -1;
13559 len = PyUnicode_GET_LENGTH(str);
13560 if (len == 0)
13561 return 0;
13562 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13563 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013564 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013565 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013566 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013567 Py_INCREF(str);
13568 writer->buffer = str;
13569 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013570 writer->pos += len;
13571 return 0;
13572 }
13573 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13574 return -1;
13575 }
13576 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13577 str, 0, len);
13578 writer->pos += len;
13579 return 0;
13580}
13581
Victor Stinnere215d962012-10-06 23:03:36 +020013582int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013583_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13584 Py_ssize_t start, Py_ssize_t end)
13585{
13586 Py_UCS4 maxchar;
13587 Py_ssize_t len;
13588
13589 if (PyUnicode_READY(str) == -1)
13590 return -1;
13591
13592 assert(0 <= start);
13593 assert(end <= PyUnicode_GET_LENGTH(str));
13594 assert(start <= end);
13595
13596 if (end == 0)
13597 return 0;
13598
13599 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13600 return _PyUnicodeWriter_WriteStr(writer, str);
13601
13602 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13603 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13604 else
13605 maxchar = writer->maxchar;
13606 len = end - start;
13607
13608 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13609 return -1;
13610
13611 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13612 str, start, len);
13613 writer->pos += len;
13614 return 0;
13615}
13616
13617int
Victor Stinner4a587072013-11-19 12:54:53 +010013618_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13619 const char *ascii, Py_ssize_t len)
13620{
13621 if (len == -1)
13622 len = strlen(ascii);
13623
13624 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13625
13626 if (writer->buffer == NULL && !writer->overallocate) {
13627 PyObject *str;
13628
13629 str = _PyUnicode_FromASCII(ascii, len);
13630 if (str == NULL)
13631 return -1;
13632
13633 writer->readonly = 1;
13634 writer->buffer = str;
13635 _PyUnicodeWriter_Update(writer);
13636 writer->pos += len;
13637 return 0;
13638 }
13639
13640 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13641 return -1;
13642
13643 switch (writer->kind)
13644 {
13645 case PyUnicode_1BYTE_KIND:
13646 {
13647 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13648 Py_UCS1 *data = writer->data;
13649
Christian Heimesf051e432016-09-13 20:22:02 +020013650 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013651 break;
13652 }
13653 case PyUnicode_2BYTE_KIND:
13654 {
13655 _PyUnicode_CONVERT_BYTES(
13656 Py_UCS1, Py_UCS2,
13657 ascii, ascii + len,
13658 (Py_UCS2 *)writer->data + writer->pos);
13659 break;
13660 }
13661 case PyUnicode_4BYTE_KIND:
13662 {
13663 _PyUnicode_CONVERT_BYTES(
13664 Py_UCS1, Py_UCS4,
13665 ascii, ascii + len,
13666 (Py_UCS4 *)writer->data + writer->pos);
13667 break;
13668 }
13669 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013670 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013671 }
13672
13673 writer->pos += len;
13674 return 0;
13675}
13676
13677int
13678_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13679 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013680{
13681 Py_UCS4 maxchar;
13682
13683 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13684 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13685 return -1;
13686 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13687 writer->pos += len;
13688 return 0;
13689}
13690
Victor Stinnerd3f08822012-05-29 12:57:52 +020013691PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013692_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013693{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013694 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013695
Victor Stinnerd3f08822012-05-29 12:57:52 +020013696 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013697 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013698 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013699 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013700
13701 str = writer->buffer;
13702 writer->buffer = NULL;
13703
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013704 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013705 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13706 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013707 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013708
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013709 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13710 PyObject *str2;
13711 str2 = resize_compact(str, writer->pos);
13712 if (str2 == NULL) {
13713 Py_DECREF(str);
13714 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013715 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013716 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013717 }
13718
Victor Stinner15a0bd32013-07-08 22:29:55 +020013719 assert(_PyUnicode_CheckConsistency(str, 1));
13720 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013721}
13722
Victor Stinnerd3f08822012-05-29 12:57:52 +020013723void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013724_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013725{
13726 Py_CLEAR(writer->buffer);
13727}
13728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013729#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013730
13731PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013732 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013733\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013734Return a formatted version of S, using substitutions from args and kwargs.\n\
13735The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013736
Eric Smith27bbca62010-11-04 17:06:58 +000013737PyDoc_STRVAR(format_map__doc__,
13738 "S.format_map(mapping) -> str\n\
13739\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013740Return a formatted version of S, using substitutions from mapping.\n\
13741The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013742
INADA Naoki3ae20562017-01-16 20:41:20 +090013743/*[clinic input]
13744str.__format__ as unicode___format__
13745
13746 format_spec: unicode
13747 /
13748
13749Return a formatted version of the string as described by format_spec.
13750[clinic start generated code]*/
13751
Eric Smith4a7d76d2008-05-30 18:10:19 +000013752static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013753unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013754/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013755{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013756 _PyUnicodeWriter writer;
13757 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013758
Victor Stinnerd3f08822012-05-29 12:57:52 +020013759 if (PyUnicode_READY(self) == -1)
13760 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013761 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013762 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13763 self, format_spec, 0,
13764 PyUnicode_GET_LENGTH(format_spec));
13765 if (ret == -1) {
13766 _PyUnicodeWriter_Dealloc(&writer);
13767 return NULL;
13768 }
13769 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013770}
13771
INADA Naoki3ae20562017-01-16 20:41:20 +090013772/*[clinic input]
13773str.__sizeof__ as unicode_sizeof
13774
13775Return the size of the string in memory, in bytes.
13776[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013777
13778static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013779unicode_sizeof_impl(PyObject *self)
13780/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013781{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013782 Py_ssize_t size;
13783
13784 /* If it's a compact object, account for base structure +
13785 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013786 if (PyUnicode_IS_COMPACT_ASCII(self))
13787 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13788 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013789 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013790 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013791 else {
13792 /* If it is a two-block object, account for base object, and
13793 for character block if present. */
13794 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013795 if (_PyUnicode_DATA_ANY(self))
13796 size += (PyUnicode_GET_LENGTH(self) + 1) *
13797 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013798 }
13799 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013800 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013801 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13802 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13803 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13804 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013805
13806 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013807}
13808
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013809static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013810unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013811{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013812 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013813 if (!copy)
13814 return NULL;
13815 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013816}
13817
/* Method table of the str type.

   Entries written as UNICODE_*_METHODDEF are macros defined elsewhere
   (presumably generated by Argument Clinic -- the "[clinic input]" blocks
   in this file suggest so). The remaining entries are spelled out by hand
   as {name, C function, calling convention flags, docstring}. The table is
   terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* do_string_format takes positional and keyword arguments, hence the
       cast through void(*)(void) before PyCFunction. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}        /* sentinel */
};
13874
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013875static PyObject *
13876unicode_mod(PyObject *v, PyObject *w)
13877{
Brian Curtindfc80e32011-08-10 20:28:54 -050013878 if (!PyUnicode_Check(v))
13879 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013880 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013881}
13882
13883static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013884 0, /*nb_add*/
13885 0, /*nb_subtract*/
13886 0, /*nb_multiply*/
13887 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013888};
13889
Guido van Rossumd57fd912000-03-10 22:53:23 +000013890static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013891 (lenfunc) unicode_length, /* sq_length */
13892 PyUnicode_Concat, /* sq_concat */
13893 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13894 (ssizeargfunc) unicode_getitem, /* sq_item */
13895 0, /* sq_slice */
13896 0, /* sq_ass_item */
13897 0, /* sq_ass_slice */
13898 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013899};
13900
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013901static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013902unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013903{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013904 if (PyUnicode_READY(self) == -1)
13905 return NULL;
13906
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013907 if (PyIndex_Check(item)) {
13908 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013909 if (i == -1 && PyErr_Occurred())
13910 return NULL;
13911 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013912 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013913 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013914 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013915 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013916 PyObject *result;
13917 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013918 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013919 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013920
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013921 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013922 return NULL;
13923 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013924 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13925 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013926
13927 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013928 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013929 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013930 slicelength == PyUnicode_GET_LENGTH(self)) {
13931 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013932 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013933 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013934 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013935 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013936 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013937 src_kind = PyUnicode_KIND(self);
13938 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013939 if (!PyUnicode_IS_ASCII(self)) {
13940 kind_limit = kind_maxchar_limit(src_kind);
13941 max_char = 0;
13942 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13943 ch = PyUnicode_READ(src_kind, src_data, cur);
13944 if (ch > max_char) {
13945 max_char = ch;
13946 if (max_char >= kind_limit)
13947 break;
13948 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013949 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013950 }
Victor Stinner55c99112011-10-13 01:17:06 +020013951 else
13952 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013953 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013954 if (result == NULL)
13955 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013956 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013957 dest_data = PyUnicode_DATA(result);
13958
13959 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013960 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13961 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013962 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013963 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013964 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013965 } else {
13966 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13967 return NULL;
13968 }
13969}
13970
13971static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013972 (lenfunc)unicode_length, /* mp_length */
13973 (binaryfunc)unicode_subscript, /* mp_subscript */
13974 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013975};
13976
Guido van Rossumd57fd912000-03-10 22:53:23 +000013977
Guido van Rossumd57fd912000-03-10 22:53:23 +000013978/* Helpers for PyUnicode_Format() */
13979
Victor Stinnera47082312012-10-04 02:19:54 +020013980struct unicode_formatter_t {
13981 PyObject *args;
13982 int args_owned;
13983 Py_ssize_t arglen, argidx;
13984 PyObject *dict;
13985
13986 enum PyUnicode_Kind fmtkind;
13987 Py_ssize_t fmtcnt, fmtpos;
13988 void *fmtdata;
13989 PyObject *fmtstr;
13990
13991 _PyUnicodeWriter writer;
13992};
13993
/* One parsed conversion specification of a %-format string. */
struct unicode_format_arg_t {
    /* Conversion character; the helpers below use it as the float format
       code and as the integer conversion type. */
    Py_UCS4 ch;
    /* Bitmask of F_* flags; F_ALT, F_SIGN and F_BLANK are tested by the
       helpers below. */
    int flags;
    /* Field width; mainformatlong() treats -1 as the value that allows
       its fast path (presumably "not specified" -- confirm). */
    Py_ssize_t width;
    /* Precision; a negative value makes formatfloat() fall back to 6. */
    int prec;
    /* NOTE(review): meaning not visible in this part of the file;
       presumably tells the caller whether sign handling applies. */
    int sign;
};
14001
Guido van Rossumd57fd912000-03-10 22:53:23 +000014002static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014003unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014004{
Victor Stinnera47082312012-10-04 02:19:54 +020014005 Py_ssize_t argidx = ctx->argidx;
14006
14007 if (argidx < ctx->arglen) {
14008 ctx->argidx++;
14009 if (ctx->arglen < 0)
14010 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014011 else
Victor Stinnera47082312012-10-04 02:19:54 +020014012 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014013 }
14014 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014015 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014016 return NULL;
14017}
14018
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014019/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014020
Victor Stinnera47082312012-10-04 02:19:54 +020014021/* Format a float into the writer if the writer is not NULL, or into *p_output
14022 otherwise.
14023
14024 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014025static int
Victor Stinnera47082312012-10-04 02:19:54 +020014026formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14027 PyObject **p_output,
14028 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014029{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014030 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014031 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014032 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014033 int prec;
14034 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014035
Guido van Rossumd57fd912000-03-10 22:53:23 +000014036 x = PyFloat_AsDouble(v);
14037 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014038 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014039
Victor Stinnera47082312012-10-04 02:19:54 +020014040 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014041 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014042 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014043
Victor Stinnera47082312012-10-04 02:19:54 +020014044 if (arg->flags & F_ALT)
14045 dtoa_flags = Py_DTSF_ALT;
14046 else
14047 dtoa_flags = 0;
14048 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014049 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014050 return -1;
14051 len = strlen(p);
14052 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014053 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014054 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014055 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014056 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014057 }
14058 else
14059 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014060 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014061 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014062}
14063
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate form, for Python's long (unbounded) ints.
 * Return value:  a new reference to a str object, or NULL if error.
 * The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 * The base marker is present only for o, x and X conversions when alt is
 * non-zero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 * val          object to be converted; must be an int (asserted)
 * alt          non-zero to keep the base marker
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions on regular ints can never
 * produce a '-' sign, but can for Python's unbounded ints.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Keep numnondigits + prec (at most 3 non-digit characters: the sign
       and a two-character base marker) within the range of int. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    /* Produce the base-N text; for o/x/X it starts with a two-character
       base marker, accounted for in numnondigits. */
    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* The bookkeeping below is done in int. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless the alternate form was requested.
       The marker is skipped by advancing buf inside result's own data;
       for a negative number the '-' is rewritten just before the digits. */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to reach the minimum number of digits
       (prec).  The padded text is assembled in a temporary bytes object,
       which then replaces result. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        /* sign and base marker, then the zero padding, then the digits */
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text lives in the temporary bytes object, or no longer starts
       at the beginning of result's data (marker stripped), build a fresh
       str from it; otherwise only adjust the length if it changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14209
Ethan Furmandf3ed242014-01-05 06:50:30 -080014210/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014211 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014212 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014213 * -1 and raise an exception on error */
14214static int
Victor Stinnera47082312012-10-04 02:19:54 +020014215mainformatlong(PyObject *v,
14216 struct unicode_format_arg_t *arg,
14217 PyObject **p_output,
14218 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014219{
14220 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014221 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014222
14223 if (!PyNumber_Check(v))
14224 goto wrongtype;
14225
Ethan Furman9ab74802014-03-21 06:38:46 -070014226 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014227 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014228 if (type == 'o' || type == 'x' || type == 'X') {
14229 iobj = PyNumber_Index(v);
14230 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014231 if (PyErr_ExceptionMatches(PyExc_TypeError))
14232 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014233 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014234 }
14235 }
14236 else {
14237 iobj = PyNumber_Long(v);
14238 if (iobj == NULL ) {
14239 if (PyErr_ExceptionMatches(PyExc_TypeError))
14240 goto wrongtype;
14241 return -1;
14242 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014243 }
14244 assert(PyLong_Check(iobj));
14245 }
14246 else {
14247 iobj = v;
14248 Py_INCREF(iobj);
14249 }
14250
14251 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014252 && arg->width == -1 && arg->prec == -1
14253 && !(arg->flags & (F_SIGN | F_BLANK))
14254 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014255 {
14256 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014257 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014258 int base;
14259
Victor Stinnera47082312012-10-04 02:19:54 +020014260 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014261 {
14262 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014263 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014264 case 'd':
14265 case 'i':
14266 case 'u':
14267 base = 10;
14268 break;
14269 case 'o':
14270 base = 8;
14271 break;
14272 case 'x':
14273 case 'X':
14274 base = 16;
14275 break;
14276 }
14277
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014278 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14279 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014280 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014281 }
14282 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014283 return 1;
14284 }
14285
Ethan Furmanb95b5612015-01-23 20:05:18 -080014286 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014287 Py_DECREF(iobj);
14288 if (res == NULL)
14289 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014290 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014291 return 0;
14292
14293wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014294 switch(type)
14295 {
14296 case 'o':
14297 case 'x':
14298 case 'X':
14299 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014300 "%%%c format: an integer is required, "
14301 "not %.200s",
14302 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014303 break;
14304 default:
14305 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014306 "%%%c format: a number is required, "
14307 "not %.200s",
14308 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014309 break;
14310 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014311 return -1;
14312}
14313
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014314static Py_UCS4
14315formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014316{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014317 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014318 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014319 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014320 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014321 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014322 goto onError;
14323 }
14324 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014325 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014326 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014327 /* make sure number is a type of integer */
14328 if (!PyLong_Check(v)) {
14329 iobj = PyNumber_Index(v);
14330 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014331 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014332 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014333 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014334 Py_DECREF(iobj);
14335 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014336 else {
14337 x = PyLong_AsLong(v);
14338 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014339 if (x == -1 && PyErr_Occurred())
14340 goto onError;
14341
Victor Stinner8faf8212011-12-08 22:14:11 +010014342 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014343 PyErr_SetString(PyExc_OverflowError,
14344 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014345 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014346 }
14347
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014348 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014349 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014350
Benjamin Peterson29060642009-01-31 22:14:21 +000014351 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014352 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014353 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014354 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014355}
14356
Victor Stinnera47082312012-10-04 02:19:54 +020014357/* Parse options of an argument: flags, width, precision.
14358 Handle also "%(name)" syntax.
14359
14360 Return 0 if the argument has been formatted into arg->str.
14361 Return 1 if the argument has been written into ctx->writer,
14362 Raise an exception and return -1 on error. */
14363static int
14364unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14365 struct unicode_format_arg_t *arg)
14366{
14367#define FORMAT_READ(ctx) \
14368 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14369
14370 PyObject *v;
14371
Victor Stinnera47082312012-10-04 02:19:54 +020014372 if (arg->ch == '(') {
14373 /* Get argument value from a dictionary. Example: "%(name)s". */
14374 Py_ssize_t keystart;
14375 Py_ssize_t keylen;
14376 PyObject *key;
14377 int pcount = 1;
14378
14379 if (ctx->dict == NULL) {
14380 PyErr_SetString(PyExc_TypeError,
14381 "format requires a mapping");
14382 return -1;
14383 }
14384 ++ctx->fmtpos;
14385 --ctx->fmtcnt;
14386 keystart = ctx->fmtpos;
14387 /* Skip over balanced parentheses */
14388 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14389 arg->ch = FORMAT_READ(ctx);
14390 if (arg->ch == ')')
14391 --pcount;
14392 else if (arg->ch == '(')
14393 ++pcount;
14394 ctx->fmtpos++;
14395 }
14396 keylen = ctx->fmtpos - keystart - 1;
14397 if (ctx->fmtcnt < 0 || pcount > 0) {
14398 PyErr_SetString(PyExc_ValueError,
14399 "incomplete format key");
14400 return -1;
14401 }
14402 key = PyUnicode_Substring(ctx->fmtstr,
14403 keystart, keystart + keylen);
14404 if (key == NULL)
14405 return -1;
14406 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014407 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014408 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014409 }
14410 ctx->args = PyObject_GetItem(ctx->dict, key);
14411 Py_DECREF(key);
14412 if (ctx->args == NULL)
14413 return -1;
14414 ctx->args_owned = 1;
14415 ctx->arglen = -1;
14416 ctx->argidx = -2;
14417 }
14418
14419 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014420 while (--ctx->fmtcnt >= 0) {
14421 arg->ch = FORMAT_READ(ctx);
14422 ctx->fmtpos++;
14423 switch (arg->ch) {
14424 case '-': arg->flags |= F_LJUST; continue;
14425 case '+': arg->flags |= F_SIGN; continue;
14426 case ' ': arg->flags |= F_BLANK; continue;
14427 case '#': arg->flags |= F_ALT; continue;
14428 case '0': arg->flags |= F_ZERO; continue;
14429 }
14430 break;
14431 }
14432
14433 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014434 if (arg->ch == '*') {
14435 v = unicode_format_getnextarg(ctx);
14436 if (v == NULL)
14437 return -1;
14438 if (!PyLong_Check(v)) {
14439 PyErr_SetString(PyExc_TypeError,
14440 "* wants int");
14441 return -1;
14442 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014443 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014444 if (arg->width == -1 && PyErr_Occurred())
14445 return -1;
14446 if (arg->width < 0) {
14447 arg->flags |= F_LJUST;
14448 arg->width = -arg->width;
14449 }
14450 if (--ctx->fmtcnt >= 0) {
14451 arg->ch = FORMAT_READ(ctx);
14452 ctx->fmtpos++;
14453 }
14454 }
14455 else if (arg->ch >= '0' && arg->ch <= '9') {
14456 arg->width = arg->ch - '0';
14457 while (--ctx->fmtcnt >= 0) {
14458 arg->ch = FORMAT_READ(ctx);
14459 ctx->fmtpos++;
14460 if (arg->ch < '0' || arg->ch > '9')
14461 break;
14462 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14463 mixing signed and unsigned comparison. Since arg->ch is between
14464 '0' and '9', casting to int is safe. */
14465 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14466 PyErr_SetString(PyExc_ValueError,
14467 "width too big");
14468 return -1;
14469 }
14470 arg->width = arg->width*10 + (arg->ch - '0');
14471 }
14472 }
14473
14474 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014475 if (arg->ch == '.') {
14476 arg->prec = 0;
14477 if (--ctx->fmtcnt >= 0) {
14478 arg->ch = FORMAT_READ(ctx);
14479 ctx->fmtpos++;
14480 }
14481 if (arg->ch == '*') {
14482 v = unicode_format_getnextarg(ctx);
14483 if (v == NULL)
14484 return -1;
14485 if (!PyLong_Check(v)) {
14486 PyErr_SetString(PyExc_TypeError,
14487 "* wants int");
14488 return -1;
14489 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014490 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014491 if (arg->prec == -1 && PyErr_Occurred())
14492 return -1;
14493 if (arg->prec < 0)
14494 arg->prec = 0;
14495 if (--ctx->fmtcnt >= 0) {
14496 arg->ch = FORMAT_READ(ctx);
14497 ctx->fmtpos++;
14498 }
14499 }
14500 else if (arg->ch >= '0' && arg->ch <= '9') {
14501 arg->prec = arg->ch - '0';
14502 while (--ctx->fmtcnt >= 0) {
14503 arg->ch = FORMAT_READ(ctx);
14504 ctx->fmtpos++;
14505 if (arg->ch < '0' || arg->ch > '9')
14506 break;
14507 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14508 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014509 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014510 return -1;
14511 }
14512 arg->prec = arg->prec*10 + (arg->ch - '0');
14513 }
14514 }
14515 }
14516
14517 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14518 if (ctx->fmtcnt >= 0) {
14519 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14520 if (--ctx->fmtcnt >= 0) {
14521 arg->ch = FORMAT_READ(ctx);
14522 ctx->fmtpos++;
14523 }
14524 }
14525 }
14526 if (ctx->fmtcnt < 0) {
14527 PyErr_SetString(PyExc_ValueError,
14528 "incomplete format");
14529 return -1;
14530 }
14531 return 0;
14532
14533#undef FORMAT_READ
14534}
14535
14536/* Format one argument. Supported conversion specifiers:
14537
14538 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014539 - "i", "d", "u": int or float
14540 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014541 - "e", "E", "f", "F", "g", "G": float
14542 - "c": int or str (1 character)
14543
Victor Stinner8dbd4212012-12-04 09:30:24 +010014544 When possible, the output is written directly into the Unicode writer
14545 (ctx->writer). A string is created when padding is required.
14546
Victor Stinnera47082312012-10-04 02:19:54 +020014547 Return 0 if the argument has been formatted into *p_str,
14548 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014549 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014550static int
14551unicode_format_arg_format(struct unicode_formatter_t *ctx,
14552 struct unicode_format_arg_t *arg,
14553 PyObject **p_str)
14554{
14555 PyObject *v;
14556 _PyUnicodeWriter *writer = &ctx->writer;
14557
14558 if (ctx->fmtcnt == 0)
14559 ctx->writer.overallocate = 0;
14560
Victor Stinnera47082312012-10-04 02:19:54 +020014561 v = unicode_format_getnextarg(ctx);
14562 if (v == NULL)
14563 return -1;
14564
Victor Stinnera47082312012-10-04 02:19:54 +020014565
14566 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014567 case 's':
14568 case 'r':
14569 case 'a':
14570 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14571 /* Fast path */
14572 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14573 return -1;
14574 return 1;
14575 }
14576
14577 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14578 *p_str = v;
14579 Py_INCREF(*p_str);
14580 }
14581 else {
14582 if (arg->ch == 's')
14583 *p_str = PyObject_Str(v);
14584 else if (arg->ch == 'r')
14585 *p_str = PyObject_Repr(v);
14586 else
14587 *p_str = PyObject_ASCII(v);
14588 }
14589 break;
14590
14591 case 'i':
14592 case 'd':
14593 case 'u':
14594 case 'o':
14595 case 'x':
14596 case 'X':
14597 {
14598 int ret = mainformatlong(v, arg, p_str, writer);
14599 if (ret != 0)
14600 return ret;
14601 arg->sign = 1;
14602 break;
14603 }
14604
14605 case 'e':
14606 case 'E':
14607 case 'f':
14608 case 'F':
14609 case 'g':
14610 case 'G':
14611 if (arg->width == -1 && arg->prec == -1
14612 && !(arg->flags & (F_SIGN | F_BLANK)))
14613 {
14614 /* Fast path */
14615 if (formatfloat(v, arg, NULL, writer) == -1)
14616 return -1;
14617 return 1;
14618 }
14619
14620 arg->sign = 1;
14621 if (formatfloat(v, arg, p_str, NULL) == -1)
14622 return -1;
14623 break;
14624
14625 case 'c':
14626 {
14627 Py_UCS4 ch = formatchar(v);
14628 if (ch == (Py_UCS4) -1)
14629 return -1;
14630 if (arg->width == -1 && arg->prec == -1) {
14631 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014632 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014633 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014634 return 1;
14635 }
14636 *p_str = PyUnicode_FromOrdinal(ch);
14637 break;
14638 }
14639
14640 default:
14641 PyErr_Format(PyExc_ValueError,
14642 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014643 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014644 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14645 (int)arg->ch,
14646 ctx->fmtpos - 1);
14647 return -1;
14648 }
14649 if (*p_str == NULL)
14650 return -1;
14651 assert (PyUnicode_Check(*p_str));
14652 return 0;
14653}
14654
14655static int
14656unicode_format_arg_output(struct unicode_formatter_t *ctx,
14657 struct unicode_format_arg_t *arg,
14658 PyObject *str)
14659{
14660 Py_ssize_t len;
14661 enum PyUnicode_Kind kind;
14662 void *pbuf;
14663 Py_ssize_t pindex;
14664 Py_UCS4 signchar;
14665 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014666 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014667 Py_ssize_t sublen;
14668 _PyUnicodeWriter *writer = &ctx->writer;
14669 Py_UCS4 fill;
14670
14671 fill = ' ';
14672 if (arg->sign && arg->flags & F_ZERO)
14673 fill = '0';
14674
14675 if (PyUnicode_READY(str) == -1)
14676 return -1;
14677
14678 len = PyUnicode_GET_LENGTH(str);
14679 if ((arg->width == -1 || arg->width <= len)
14680 && (arg->prec == -1 || arg->prec >= len)
14681 && !(arg->flags & (F_SIGN | F_BLANK)))
14682 {
14683 /* Fast path */
14684 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14685 return -1;
14686 return 0;
14687 }
14688
14689 /* Truncate the string for "s", "r" and "a" formats
14690 if the precision is set */
14691 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14692 if (arg->prec >= 0 && len > arg->prec)
14693 len = arg->prec;
14694 }
14695
14696 /* Adjust sign and width */
14697 kind = PyUnicode_KIND(str);
14698 pbuf = PyUnicode_DATA(str);
14699 pindex = 0;
14700 signchar = '\0';
14701 if (arg->sign) {
14702 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14703 if (ch == '-' || ch == '+') {
14704 signchar = ch;
14705 len--;
14706 pindex++;
14707 }
14708 else if (arg->flags & F_SIGN)
14709 signchar = '+';
14710 else if (arg->flags & F_BLANK)
14711 signchar = ' ';
14712 else
14713 arg->sign = 0;
14714 }
14715 if (arg->width < len)
14716 arg->width = len;
14717
14718 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014719 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014720 if (!(arg->flags & F_LJUST)) {
14721 if (arg->sign) {
14722 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014723 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014724 }
14725 else {
14726 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014727 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014728 }
14729 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014730 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14731 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014732 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014733 }
14734
Victor Stinnera47082312012-10-04 02:19:54 +020014735 buflen = arg->width;
14736 if (arg->sign && len == arg->width)
14737 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014738 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014739 return -1;
14740
14741 /* Write the sign if needed */
14742 if (arg->sign) {
14743 if (fill != ' ') {
14744 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14745 writer->pos += 1;
14746 }
14747 if (arg->width > len)
14748 arg->width--;
14749 }
14750
14751 /* Write the numeric prefix for "x", "X" and "o" formats
14752 if the alternate form is used.
14753 For example, write "0x" for the "%#x" format. */
14754 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14755 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14756 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14757 if (fill != ' ') {
14758 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14759 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14760 writer->pos += 2;
14761 pindex += 2;
14762 }
14763 arg->width -= 2;
14764 if (arg->width < 0)
14765 arg->width = 0;
14766 len -= 2;
14767 }
14768
14769 /* Pad left with the fill character if needed */
14770 if (arg->width > len && !(arg->flags & F_LJUST)) {
14771 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014772 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014773 writer->pos += sublen;
14774 arg->width = len;
14775 }
14776
14777 /* If padding with spaces: write sign if needed and/or numeric prefix if
14778 the alternate form is used */
14779 if (fill == ' ') {
14780 if (arg->sign) {
14781 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14782 writer->pos += 1;
14783 }
14784 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14785 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14786 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14787 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14788 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14789 writer->pos += 2;
14790 pindex += 2;
14791 }
14792 }
14793
14794 /* Write characters */
14795 if (len) {
14796 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14797 str, pindex, len);
14798 writer->pos += len;
14799 }
14800
14801 /* Pad right with the fill character if needed */
14802 if (arg->width > len) {
14803 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014804 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014805 writer->pos += sublen;
14806 }
14807 return 0;
14808}
14809
14810/* Helper of PyUnicode_Format(): format one arg.
14811 Return 0 on success, raise an exception and return -1 on error. */
14812static int
14813unicode_format_arg(struct unicode_formatter_t *ctx)
14814{
14815 struct unicode_format_arg_t arg;
14816 PyObject *str;
14817 int ret;
14818
Victor Stinner8dbd4212012-12-04 09:30:24 +010014819 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014820 if (arg.ch == '%') {
14821 ctx->fmtpos++;
14822 ctx->fmtcnt--;
14823 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14824 return -1;
14825 return 0;
14826 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014827 arg.flags = 0;
14828 arg.width = -1;
14829 arg.prec = -1;
14830 arg.sign = 0;
14831 str = NULL;
14832
Victor Stinnera47082312012-10-04 02:19:54 +020014833 ret = unicode_format_arg_parse(ctx, &arg);
14834 if (ret == -1)
14835 return -1;
14836
14837 ret = unicode_format_arg_format(ctx, &arg, &str);
14838 if (ret == -1)
14839 return -1;
14840
14841 if (ret != 1) {
14842 ret = unicode_format_arg_output(ctx, &arg, str);
14843 Py_DECREF(str);
14844 if (ret == -1)
14845 return -1;
14846 }
14847
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014848 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014849 PyErr_SetString(PyExc_TypeError,
14850 "not all arguments converted during string formatting");
14851 return -1;
14852 }
14853 return 0;
14854}
14855
Alexander Belopolsky40018472011-02-26 01:02:56 +000014856PyObject *
14857PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014858{
Victor Stinnera47082312012-10-04 02:19:54 +020014859 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014860
Guido van Rossumd57fd912000-03-10 22:53:23 +000014861 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014862 PyErr_BadInternalCall();
14863 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014864 }
Victor Stinnera47082312012-10-04 02:19:54 +020014865
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014866 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014867 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014868
14869 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014870 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14871 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14872 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14873 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014874
Victor Stinner8f674cc2013-04-17 23:02:17 +020014875 _PyUnicodeWriter_Init(&ctx.writer);
14876 ctx.writer.min_length = ctx.fmtcnt + 100;
14877 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014878
Guido van Rossumd57fd912000-03-10 22:53:23 +000014879 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014880 ctx.arglen = PyTuple_Size(args);
14881 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014882 }
14883 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014884 ctx.arglen = -1;
14885 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014886 }
Victor Stinnera47082312012-10-04 02:19:54 +020014887 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014888 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014889 ctx.dict = args;
14890 else
14891 ctx.dict = NULL;
14892 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014893
Victor Stinnera47082312012-10-04 02:19:54 +020014894 while (--ctx.fmtcnt >= 0) {
14895 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014896 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014897
14898 nonfmtpos = ctx.fmtpos++;
14899 while (ctx.fmtcnt >= 0 &&
14900 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14901 ctx.fmtpos++;
14902 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014903 }
Victor Stinnera47082312012-10-04 02:19:54 +020014904 if (ctx.fmtcnt < 0) {
14905 ctx.fmtpos--;
14906 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014907 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014908
Victor Stinnercfc4c132013-04-03 01:48:39 +020014909 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14910 nonfmtpos, ctx.fmtpos) < 0)
14911 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014912 }
14913 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014914 ctx.fmtpos++;
14915 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014916 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014917 }
14918 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014919
Victor Stinnera47082312012-10-04 02:19:54 +020014920 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014921 PyErr_SetString(PyExc_TypeError,
14922 "not all arguments converted during string formatting");
14923 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014924 }
14925
Victor Stinnera47082312012-10-04 02:19:54 +020014926 if (ctx.args_owned) {
14927 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014928 }
Victor Stinnera47082312012-10-04 02:19:54 +020014929 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014930
Benjamin Peterson29060642009-01-31 22:14:21 +000014931 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014932 _PyUnicodeWriter_Dealloc(&ctx.writer);
14933 if (ctx.args_owned) {
14934 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014935 }
14936 return NULL;
14937}
14938
Jeremy Hylton938ace62002-07-17 16:30:39 +000014939static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014940unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14941
Tim Peters6d6c1a32001-08-02 04:15:00 +000014942static PyObject *
14943unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14944{
Benjamin Peterson29060642009-01-31 22:14:21 +000014945 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014946 static char *kwlist[] = {"object", "encoding", "errors", 0};
14947 char *encoding = NULL;
14948 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014949
Benjamin Peterson14339b62009-01-31 16:36:08 +000014950 if (type != &PyUnicode_Type)
14951 return unicode_subtype_new(type, args, kwds);
14952 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014953 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014954 return NULL;
14955 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014956 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014957 if (encoding == NULL && errors == NULL)
14958 return PyObject_Str(x);
14959 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014960 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014961}
14962
Guido van Rossume023fe02001-08-30 03:12:59 +000014963static PyObject *
14964unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14965{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014966 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014967 Py_ssize_t length, char_size;
14968 int share_wstr, share_utf8;
14969 unsigned int kind;
14970 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014971
Benjamin Peterson14339b62009-01-31 16:36:08 +000014972 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014973
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014974 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014975 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014976 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014977 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014978 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014979 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014980 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014981 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014982
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014983 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014984 if (self == NULL) {
14985 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014986 return NULL;
14987 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014988 kind = PyUnicode_KIND(unicode);
14989 length = PyUnicode_GET_LENGTH(unicode);
14990
14991 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014992#ifdef Py_DEBUG
14993 _PyUnicode_HASH(self) = -1;
14994#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014995 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014996#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014997 _PyUnicode_STATE(self).interned = 0;
14998 _PyUnicode_STATE(self).kind = kind;
14999 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015000 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015001 _PyUnicode_STATE(self).ready = 1;
15002 _PyUnicode_WSTR(self) = NULL;
15003 _PyUnicode_UTF8_LENGTH(self) = 0;
15004 _PyUnicode_UTF8(self) = NULL;
15005 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015006 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015007
15008 share_utf8 = 0;
15009 share_wstr = 0;
15010 if (kind == PyUnicode_1BYTE_KIND) {
15011 char_size = 1;
15012 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15013 share_utf8 = 1;
15014 }
15015 else if (kind == PyUnicode_2BYTE_KIND) {
15016 char_size = 2;
15017 if (sizeof(wchar_t) == 2)
15018 share_wstr = 1;
15019 }
15020 else {
15021 assert(kind == PyUnicode_4BYTE_KIND);
15022 char_size = 4;
15023 if (sizeof(wchar_t) == 4)
15024 share_wstr = 1;
15025 }
15026
15027 /* Ensure we won't overflow the length. */
15028 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15029 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015030 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015031 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015032 data = PyObject_MALLOC((length + 1) * char_size);
15033 if (data == NULL) {
15034 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015035 goto onError;
15036 }
15037
Victor Stinnerc3c74152011-10-02 20:39:55 +020015038 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015039 if (share_utf8) {
15040 _PyUnicode_UTF8_LENGTH(self) = length;
15041 _PyUnicode_UTF8(self) = data;
15042 }
15043 if (share_wstr) {
15044 _PyUnicode_WSTR_LENGTH(self) = length;
15045 _PyUnicode_WSTR(self) = (wchar_t *)data;
15046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015047
Christian Heimesf051e432016-09-13 20:22:02 +020015048 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015049 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015050 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015051#ifdef Py_DEBUG
15052 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15053#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015054 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015055 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015056
15057onError:
15058 Py_DECREF(unicode);
15059 Py_DECREF(self);
15060 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015061}
15062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015063PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015064"str(object='') -> str\n\
15065str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015066\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015067Create a new string object from the given object. If encoding or\n\
15068errors is specified, then the object must expose a data buffer\n\
15069that will be decoded using the given encoding and error handler.\n\
15070Otherwise, returns the result of object.__str__() (if defined)\n\
15071or repr(object).\n\
15072encoding defaults to sys.getdefaultencoding().\n\
15073errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015074
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015075static PyObject *unicode_iter(PyObject *seq);
15076
Guido van Rossumd57fd912000-03-10 22:53:23 +000015077PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015078 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015079 "str", /* tp_name */
15080 sizeof(PyUnicodeObject), /* tp_basicsize */
15081 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015082 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015083 (destructor)unicode_dealloc, /* tp_dealloc */
15084 0, /* tp_print */
15085 0, /* tp_getattr */
15086 0, /* tp_setattr */
15087 0, /* tp_reserved */
15088 unicode_repr, /* tp_repr */
15089 &unicode_as_number, /* tp_as_number */
15090 &unicode_as_sequence, /* tp_as_sequence */
15091 &unicode_as_mapping, /* tp_as_mapping */
15092 (hashfunc) unicode_hash, /* tp_hash*/
15093 0, /* tp_call*/
15094 (reprfunc) unicode_str, /* tp_str */
15095 PyObject_GenericGetAttr, /* tp_getattro */
15096 0, /* tp_setattro */
15097 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015098 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015099 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15100 unicode_doc, /* tp_doc */
15101 0, /* tp_traverse */
15102 0, /* tp_clear */
15103 PyUnicode_RichCompare, /* tp_richcompare */
15104 0, /* tp_weaklistoffset */
15105 unicode_iter, /* tp_iter */
15106 0, /* tp_iternext */
15107 unicode_methods, /* tp_methods */
15108 0, /* tp_members */
15109 0, /* tp_getset */
15110 &PyBaseObject_Type, /* tp_base */
15111 0, /* tp_dict */
15112 0, /* tp_descr_get */
15113 0, /* tp_descr_set */
15114 0, /* tp_dictoffset */
15115 0, /* tp_init */
15116 0, /* tp_alloc */
15117 unicode_new, /* tp_new */
15118 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015119};
15120
15121/* Initialize the Unicode implementation */
15122
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015123_PyInitError
15124_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015125{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015126 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015127 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015128 0x000A, /* LINE FEED */
15129 0x000D, /* CARRIAGE RETURN */
15130 0x001C, /* FILE SEPARATOR */
15131 0x001D, /* GROUP SEPARATOR */
15132 0x001E, /* RECORD SEPARATOR */
15133 0x0085, /* NEXT LINE */
15134 0x2028, /* LINE SEPARATOR */
15135 0x2029, /* PARAGRAPH SEPARATOR */
15136 };
15137
Fred Drakee4315f52000-05-09 19:53:39 +000015138 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015139 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015140 if (!unicode_empty) {
15141 return _Py_INIT_ERR("Can't create empty string");
15142 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015143 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015144
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015145 if (PyType_Ready(&PyUnicode_Type) < 0) {
15146 return _Py_INIT_ERR("Can't initialize unicode type");
15147 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015148
15149 /* initialize the linebreak bloom filter */
15150 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015151 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015152 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015153
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015154 if (PyType_Ready(&EncodingMapType) < 0) {
15155 return _Py_INIT_ERR("Can't initialize encoding map type");
15156 }
15157 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15158 return _Py_INIT_ERR("Can't initialize field name iterator type");
15159 }
15160 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15161 return _Py_INIT_ERR("Can't initialize formatter iter type");
15162 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015163 return _Py_INIT_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015164}
15165
/* Free list management */
15167
/* Free-list clearing entry point for the Unicode implementation.
   This version does no work and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15173
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015174
Walter Dörwald16807132007-05-25 13:52:07 +000015175void
15176PyUnicode_InternInPlace(PyObject **p)
15177{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015178 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015179 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015180#ifdef Py_DEBUG
15181 assert(s != NULL);
15182 assert(_PyUnicode_CHECK(s));
15183#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015184 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015185 return;
15186#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015187 /* If it's a subclass, we don't really know what putting
15188 it in the interned dict might do. */
15189 if (!PyUnicode_CheckExact(s))
15190 return;
15191 if (PyUnicode_CHECK_INTERNED(s))
15192 return;
15193 if (interned == NULL) {
15194 interned = PyDict_New();
15195 if (interned == NULL) {
15196 PyErr_Clear(); /* Don't leave an exception */
15197 return;
15198 }
15199 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015200 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015201 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015202 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015203 if (t == NULL) {
15204 PyErr_Clear();
15205 return;
15206 }
15207 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015208 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015209 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015210 return;
15211 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015212 /* The two references in interned are not counted by refcnt.
15213 The deallocator will take care of this */
15214 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015215 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015216}
15217
15218void
15219PyUnicode_InternImmortal(PyObject **p)
15220{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015221 PyUnicode_InternInPlace(p);
15222 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015223 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015224 Py_INCREF(*p);
15225 }
Walter Dörwald16807132007-05-25 13:52:07 +000015226}
15227
15228PyObject *
15229PyUnicode_InternFromString(const char *cp)
15230{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015231 PyObject *s = PyUnicode_FromString(cp);
15232 if (s == NULL)
15233 return NULL;
15234 PyUnicode_InternInPlace(&s);
15235 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015236}
15237
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015238
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo interning for every string in the `interned` dict, then empty and
   release the dict.  Does nothing if the dict is missing or not a dict,
   or if its key list cannot be obtained (the error is cleared). */
static void
unicode_release_interned(void)
{
    PyObject *key_list;
    Py_ssize_t idx, count;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }

    key_list = PyDict_Keys(interned);
    if (key_list == NULL || !PyList_Check(key_list)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    count = PyList_GET_SIZE(key_list);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif

    for (idx = 0; idx < count; idx++) {
        PyObject *str = PyList_GET_ITEM(key_list, idx);

        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }

        switch (PyUnicode_CHECK_INTERNED(str)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            /* Give back one reference for an immortal string. */
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Give back the two references removed at interning time. */
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }

        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }

#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif

    Py_DECREF(key_list);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015298
15299
15300/********************* Unicode Iterator **************************/
15301
15302typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015303 PyObject_HEAD
15304 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015305 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015306} unicodeiterobject;
15307
15308static void
15309unicodeiter_dealloc(unicodeiterobject *it)
15310{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015311 _PyObject_GC_UNTRACK(it);
15312 Py_XDECREF(it->it_seq);
15313 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015314}
15315
15316static int
15317unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15318{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015319 Py_VISIT(it->it_seq);
15320 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015321}
15322
15323static PyObject *
15324unicodeiter_next(unicodeiterobject *it)
15325{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015326 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015327
Benjamin Peterson14339b62009-01-31 16:36:08 +000015328 assert(it != NULL);
15329 seq = it->it_seq;
15330 if (seq == NULL)
15331 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015332 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015334 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15335 int kind = PyUnicode_KIND(seq);
15336 void *data = PyUnicode_DATA(seq);
15337 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15338 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015339 if (item != NULL)
15340 ++it->it_index;
15341 return item;
15342 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015343
Benjamin Peterson14339b62009-01-31 16:36:08 +000015344 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015345 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015346 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015347}
15348
15349static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015350unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015351{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015352 Py_ssize_t len = 0;
15353 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015354 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015355 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015356}
15357
15358PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15359
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015360static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015361unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015362{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015363 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015364 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015365 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015366 it->it_seq, it->it_index);
15367 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015368 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015369 if (u == NULL)
15370 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015371 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015372 }
15373}
15374
15375PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15376
15377static PyObject *
15378unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15379{
15380 Py_ssize_t index = PyLong_AsSsize_t(state);
15381 if (index == -1 && PyErr_Occurred())
15382 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015383 if (it->it_seq != NULL) {
15384 if (index < 0)
15385 index = 0;
15386 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15387 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15388 it->it_index = index;
15389 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015390 Py_RETURN_NONE;
15391}
15392
15393PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15394
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015395static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015396 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015397 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015398 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15399 reduce_doc},
15400 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15401 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015402 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015403};
15404
15405PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015406 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15407 "str_iterator", /* tp_name */
15408 sizeof(unicodeiterobject), /* tp_basicsize */
15409 0, /* tp_itemsize */
15410 /* methods */
15411 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15412 0, /* tp_print */
15413 0, /* tp_getattr */
15414 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015415 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015416 0, /* tp_repr */
15417 0, /* tp_as_number */
15418 0, /* tp_as_sequence */
15419 0, /* tp_as_mapping */
15420 0, /* tp_hash */
15421 0, /* tp_call */
15422 0, /* tp_str */
15423 PyObject_GenericGetAttr, /* tp_getattro */
15424 0, /* tp_setattro */
15425 0, /* tp_as_buffer */
15426 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15427 0, /* tp_doc */
15428 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15429 0, /* tp_clear */
15430 0, /* tp_richcompare */
15431 0, /* tp_weaklistoffset */
15432 PyObject_SelfIter, /* tp_iter */
15433 (iternextfunc)unicodeiter_next, /* tp_iternext */
15434 unicodeiter_methods, /* tp_methods */
15435 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015436};
15437
15438static PyObject *
15439unicode_iter(PyObject *seq)
15440{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015441 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015442
Benjamin Peterson14339b62009-01-31 16:36:08 +000015443 if (!PyUnicode_Check(seq)) {
15444 PyErr_BadInternalCall();
15445 return NULL;
15446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015447 if (PyUnicode_READY(seq) == -1)
15448 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015449 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15450 if (it == NULL)
15451 return NULL;
15452 it->it_index = 0;
15453 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015454 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015455 _PyObject_GC_TRACK(it);
15456 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015457}
15458
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015459
15460size_t
15461Py_UNICODE_strlen(const Py_UNICODE *u)
15462{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015463 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015464}
15465
15466Py_UNICODE*
15467Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15468{
15469 Py_UNICODE *u = s1;
15470 while ((*u++ = *s2++));
15471 return s1;
15472}
15473
15474Py_UNICODE*
15475Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15476{
15477 Py_UNICODE *u = s1;
15478 while ((*u++ = *s2++))
15479 if (n-- == 0)
15480 break;
15481 return s1;
15482}
15483
15484Py_UNICODE*
15485Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15486{
15487 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015488 u1 += wcslen(u1);
15489 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015490 return s1;
15491}
15492
15493int
15494Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15495{
15496 while (*s1 && *s2 && *s1 == *s2)
15497 s1++, s2++;
15498 if (*s1 && *s2)
15499 return (*s1 < *s2) ? -1 : +1;
15500 if (*s1)
15501 return 1;
15502 if (*s2)
15503 return -1;
15504 return 0;
15505}
15506
15507int
15508Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15509{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015510 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015511 for (; n != 0; n--) {
15512 u1 = *s1;
15513 u2 = *s2;
15514 if (u1 != u2)
15515 return (u1 < u2) ? -1 : +1;
15516 if (u1 == '\0')
15517 return 0;
15518 s1++;
15519 s2++;
15520 }
15521 return 0;
15522}
15523
15524Py_UNICODE*
15525Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15526{
15527 const Py_UNICODE *p;
15528 for (p = s; *p; p++)
15529 if (*p == c)
15530 return (Py_UNICODE*)p;
15531 return NULL;
15532}
15533
15534Py_UNICODE*
15535Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15536{
15537 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015538 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015539 while (p != s) {
15540 p--;
15541 if (*p == c)
15542 return (Py_UNICODE*)p;
15543 }
15544 return NULL;
15545}
Victor Stinner331ea922010-08-10 16:37:20 +000015546
Victor Stinner71133ff2010-09-01 23:43:53 +000015547Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015548PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015549{
Victor Stinner577db2c2011-10-11 22:12:48 +020015550 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015551 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015553 if (!PyUnicode_Check(unicode)) {
15554 PyErr_BadArgument();
15555 return NULL;
15556 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015557 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015558 if (u == NULL)
15559 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015560 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015561 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015562 PyErr_NoMemory();
15563 return NULL;
15564 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015565 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015566 size *= sizeof(Py_UNICODE);
15567 copy = PyMem_Malloc(size);
15568 if (copy == NULL) {
15569 PyErr_NoMemory();
15570 return NULL;
15571 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015572 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015573 return copy;
15574}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015575
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015576
15577void
15578_PyUnicode_Fini(void)
15579{
15580#if defined(WITH_VALGRIND) || defined(__INSURE__)
15581 /* Insure++ is a memory analysis tool that aids in discovering
15582 * memory leaks and other memory problems. On Python exit, the
15583 * interned string dictionaries are flagged as being in use at exit
15584 * (which it is). Under normal circumstances, this is fine because
15585 * the memory will be automatically reclaimed by the system. Under
15586 * memory debugging, it's a huge source of useless noise, so we
15587 * trade off slower shutdown for less distraction in the memory
15588 * reports. -baw
15589 */
15590 unicode_release_interned();
15591#endif /* __INSURE__ */
15592
15593 Py_CLEAR(unicode_empty);
15594
15595 for (Py_ssize_t i = 0; i < 256; i++) {
15596 Py_CLEAR(unicode_latin1[i]);
15597 }
15598 _PyUnicode_ClearStaticStrings();
15599 (void)PyUnicode_ClearFreeList();
15600}
15601
15602
Georg Brandl66c221e2010-10-14 07:04:07 +000015603/* A _string module, to export formatter_parser and formatter_field_name_split
15604 to the string.Formatter class implemented in Python. */
15605
15606static PyMethodDef _string_methods[] = {
15607 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15608 METH_O, PyDoc_STR("split the argument as a field name")},
15609 {"formatter_parser", (PyCFunction) formatter_parser,
15610 METH_O, PyDoc_STR("parse the argument as a format string")},
15611 {NULL, NULL}
15612};
15613
15614static struct PyModuleDef _string_module = {
15615 PyModuleDef_HEAD_INIT,
15616 "_string",
15617 PyDoc_STR("string helper module"),
15618 0,
15619 _string_methods,
15620 NULL,
15621 NULL,
15622 NULL,
15623 NULL
15624};
15625
15626PyMODINIT_FUNC
15627PyInit__string(void)
15628{
15629 return PyModule_Create(&_string_module);
15630}
15631
15632
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015633#ifdef __cplusplus
15634}
15635#endif