blob: 1433848c81f8e1c3eb4e6b153a8d8d157094ee81 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner45876a92020-02-12 22:32:34 +010044#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010045#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_initconfig.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020047#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinnerbcda8f12018-11-21 22:27:47 +010048#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020049#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040050#include "pycore_pylifecycle.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020051#include "pycore_pystate.h" // _PyInterpreterState_GET()
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000052#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070053#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000055#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000056#include <windows.h>
57#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000058
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insecure++. */
/* #define INTERNED_STATS 1 */
62
63
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
68
/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080082
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff
98
/* Check used inside the accessor macros of this file: debug builds run
   _PyUnicode_CheckConsistency() (without the content scan), other builds
   only test the type with PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200104
/* Raw access to the `utf8` field; `op` is reinterpreted as a
   PyCompactUnicodeObject without any check. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 buffer of a ready string: for a compact ASCII
   object this is the memory located right after the PyASCIIObject header,
   otherwise it is the `utf8` field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the `utf8_length` field (no check). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked counterpart of PyUnicode_UTF8() for the length: compact ASCII
   objects report `length`, all others report `utf8_length`. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the `wstr` field (no check). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900123
/* Don't use the deprecated macro of unicodeobject.h: redefine it locally.
   Compact ASCII objects report `length`; every other object reports
   `wstr_length`.  The argument is parenthesized so that any pointer
   expression can be passed safely. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Direct field accessors.  Each one only casts `op` to the structure that
   declares the field; the two variants that assert (_PyUnicode_KIND and
   _PyUnicode_GET_LENGTH) additionally run _PyUnicode_CHECK() first. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200146
/* Local replacement for PyUnicode_READY(): evaluates to 0 when the object is
   already ready, otherwise to the result of _PyUnicode_Ready(op).  The
   object is checked with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200153
/* true if the `utf8` pointer of a non compact-ASCII object is the same
   pointer as its character data (the two representations share memory) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the `wstr` pointer of the object is the same pointer as its
   character data.  Both sides of the comparison use the macro argument, so
   the macro does not depend on a caller variable named `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
161
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data); a non-NULL wstr of an object that is not
   ready always counts as allocated */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
175
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The copy is done four elements at a time for the largest multiple of
   four that fits, then one element at a time for the remainder.  Each
   macro argument is evaluated exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin);\
        const from_type *_end = (const from_type *)(end);\
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200199
/* Overallocation factor used when growing buffers.
   NOTE(review): the 50% / 25% figures below suggest the value is applied as
   a divisor of the requested size -- confirm against the users of this
   macro. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
207
Victor Stinner607b1022020-05-05 18:50:30 +0200208/* bpo-40521: Interned strings are shared by all interpreters. */
209#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
210# define INTERNED_STRINGS
211#endif
212
Walter Dörwald16807132007-05-25 13:52:07 +0000213/* This dictionary holds all interned unicode strings. Note that references
214 to strings in this dictionary are *not* counted in the string's ob_refcnt.
215 When the interned string reaches a refcnt of 0 the string deallocation
216 function will delete the reference from this dictionary.
217
218 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000219 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000220*/
Victor Stinner607b1022020-05-05 18:50:30 +0200221#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200222static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200223#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000224
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000225/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200226static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200227
Serhiy Storchaka678db842013-01-26 12:16:36 +0200228#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200229 do { \
230 if (unicode_empty != NULL) \
231 Py_INCREF(unicode_empty); \
232 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200233 unicode_empty = PyUnicode_New(0, 0); \
234 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200235 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200236 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
237 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200238 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200239 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000240
Serhiy Storchaka678db842013-01-26 12:16:36 +0200241#define _Py_RETURN_UNICODE_EMPTY() \
242 do { \
243 _Py_INCREF_UNICODE_EMPTY(); \
244 return unicode_empty; \
245 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000246
Victor Stinner59423e32018-11-26 13:40:01 +0100247static inline void
248unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
249 Py_ssize_t start, Py_ssize_t length)
250{
251 assert(0 <= start);
252 assert(kind != PyUnicode_WCHAR_KIND);
253 switch (kind) {
254 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100255 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100256 Py_UCS1 ch = (unsigned char)value;
257 Py_UCS1 *to = (Py_UCS1 *)data + start;
258 memset(to, ch, length);
259 break;
260 }
261 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100262 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100263 Py_UCS2 ch = (Py_UCS2)value;
264 Py_UCS2 *to = (Py_UCS2 *)data + start;
265 const Py_UCS2 *end = to + length;
266 for (; to < end; ++to) *to = ch;
267 break;
268 }
269 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100270 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100271 Py_UCS4 ch = value;
272 Py_UCS4 * to = (Py_UCS4 *)data + start;
273 const Py_UCS4 *end = to + length;
274 for (; to < end; ++to) *to = ch;
275 break;
276 }
277 default: Py_UNREACHABLE();
278 }
279}
280
281
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200282/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700283static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200284_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900285static inline void
286_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400287static PyObject *
288unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
289 const char *errors);
290static PyObject *
291unicode_decode_utf8(const char *s, Py_ssize_t size,
292 _Py_error_handler error_handler, const char *errors,
293 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200294
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200295/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200296static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200297
Victor Stinner607b1022020-05-05 18:50:30 +0200298/* bpo-40521: Latin1 singletons are shared by all interpreters. */
299#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
300# define LATIN1_SINGLETONS
301#endif
302
303#ifdef LATIN1_SINGLETONS
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000304/* Single character Unicode strings in the Latin-1 range are being
305 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200306static PyObject *unicode_latin1[256] = {NULL};
Victor Stinner607b1022020-05-05 18:50:30 +0200307#endif
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000308
/* Fast detection of the most frequent whitespace characters.
   The table has one entry per ASCII code point (128 entries); an entry is 1
   when the code point is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
339
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200340/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200341static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200342static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100343static int unicode_modifiable(PyObject *unicode);
344
Victor Stinnerfe226c02011-10-03 03:52:20 +0200345
Alexander Belopolsky40018472011-02-26 01:02:56 +0000346static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100347_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200348static PyObject *
349_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
350static PyObject *
351_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
352
353static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000354unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000355 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100356 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000357 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
358
Alexander Belopolsky40018472011-02-26 01:02:56 +0000359static void
360raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300361 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100362 PyObject *unicode,
363 Py_ssize_t startpos, Py_ssize_t endpos,
364 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000365
/* Fast detection of line break characters.
   The table has one entry per ASCII code point (128 entries); an entry is 1
   when the code point is treated as a line break and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
393
INADA Naoki3ae20562017-01-16 20:41:20 +0900394static int convert_uc(PyObject *obj, void *addr);
395
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300396#include "clinic/unicodeobject.c.h"
397
Victor Stinner3d4226a2018-08-29 22:21:32 +0200398_Py_error_handler
399_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200400{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200401 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200402 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200403 }
404 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200405 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200406 }
407 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200408 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200409 }
410 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200411 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200412 }
413 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200414 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200415 }
416 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200417 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200418 }
419 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200420 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200421 }
Victor Stinner50149202015-09-22 00:26:54 +0200422 return _Py_ERROR_OTHER;
423}
424
Victor Stinner709d23d2019-05-02 14:56:30 -0400425
426static _Py_error_handler
427get_error_handler_wide(const wchar_t *errors)
428{
429 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
430 return _Py_ERROR_STRICT;
431 }
432 if (wcscmp(errors, L"surrogateescape") == 0) {
433 return _Py_ERROR_SURROGATEESCAPE;
434 }
435 if (wcscmp(errors, L"replace") == 0) {
436 return _Py_ERROR_REPLACE;
437 }
438 if (wcscmp(errors, L"ignore") == 0) {
439 return _Py_ERROR_IGNORE;
440 }
441 if (wcscmp(errors, L"backslashreplace") == 0) {
442 return _Py_ERROR_BACKSLASHREPLACE;
443 }
444 if (wcscmp(errors, L"surrogatepass") == 0) {
445 return _Py_ERROR_SURROGATEPASS;
446 }
447 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
448 return _Py_ERROR_XMLCHARREFREPLACE;
449 }
450 return _Py_ERROR_OTHER;
451}
452
453
Victor Stinner22eb6892019-06-26 00:51:05 +0200454static inline int
455unicode_check_encoding_errors(const char *encoding, const char *errors)
456{
457 if (encoding == NULL && errors == NULL) {
458 return 0;
459 }
460
Victor Stinner81a7be32020-04-14 15:14:01 +0200461 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200462#ifndef Py_DEBUG
463 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200464 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200465 return 0;
466 }
467#else
468 /* Always check in debug mode */
469#endif
470
471 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
472 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200473 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200474 return 0;
475 }
476
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200477 /* Disable checks during Python finalization. For example, it allows to
478 call _PyObject_Dump() during finalization for debugging purpose. */
479 if (interp->finalizing) {
480 return 0;
481 }
482
Victor Stinner22eb6892019-06-26 00:51:05 +0200483 if (encoding != NULL) {
484 PyObject *handler = _PyCodec_Lookup(encoding);
485 if (handler == NULL) {
486 return -1;
487 }
488 Py_DECREF(handler);
489 }
490
491 if (errors != NULL) {
492 PyObject *handler = PyCodec_LookupError(errors);
493 if (handler == NULL) {
494 return -1;
495 }
496 Py_DECREF(handler);
497 }
498 return 0;
499}
500
501
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300502/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
503 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000504Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000505PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000506{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000507#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000508 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000509#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000510 /* This is actually an illegal character, so it should
511 not be passed to unichr. */
512 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000513#endif
514}
515
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200516int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100517_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200518{
Victor Stinner68762572019-10-07 18:42:01 +0200519#define CHECK(expr) \
520 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
521
Victor Stinner910337b2011-10-03 03:20:16 +0200522 PyASCIIObject *ascii;
523 unsigned int kind;
524
Victor Stinner68762572019-10-07 18:42:01 +0200525 assert(op != NULL);
526 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200527
528 ascii = (PyASCIIObject *)op;
529 kind = ascii->state.kind;
530
Victor Stinnera3b334d2011-10-03 13:53:37 +0200531 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200532 CHECK(kind == PyUnicode_1BYTE_KIND);
533 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200534 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200535 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200536 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200537 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200538
Victor Stinnera41463c2011-10-04 01:05:08 +0200539 if (ascii->state.compact == 1) {
540 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200541 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200542 || kind == PyUnicode_2BYTE_KIND
543 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200544 CHECK(ascii->state.ascii == 0);
545 CHECK(ascii->state.ready == 1);
546 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100547 }
548 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200549 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
550
551 data = unicode->data.any;
552 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200553 CHECK(ascii->length == 0);
554 CHECK(ascii->hash == -1);
555 CHECK(ascii->state.compact == 0);
556 CHECK(ascii->state.ascii == 0);
557 CHECK(ascii->state.ready == 0);
558 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
559 CHECK(ascii->wstr != NULL);
560 CHECK(data == NULL);
561 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200562 }
563 else {
Victor Stinner68762572019-10-07 18:42:01 +0200564 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200565 || kind == PyUnicode_2BYTE_KIND
566 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200567 CHECK(ascii->state.compact == 0);
568 CHECK(ascii->state.ready == 1);
569 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200570 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200571 CHECK(compact->utf8 == data);
572 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200573 }
574 else
Victor Stinner68762572019-10-07 18:42:01 +0200575 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200576 }
577 }
578 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200579 if (
580#if SIZEOF_WCHAR_T == 2
581 kind == PyUnicode_2BYTE_KIND
582#else
583 kind == PyUnicode_4BYTE_KIND
584#endif
585 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200586 {
Victor Stinner68762572019-10-07 18:42:01 +0200587 CHECK(ascii->wstr == data);
588 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200589 } else
Victor Stinner68762572019-10-07 18:42:01 +0200590 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200591 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200592
593 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200594 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200595 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200596 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200597 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200598
599 /* check that the best kind is used: O(n) operation */
600 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200601 Py_ssize_t i;
602 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300603 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200604 Py_UCS4 ch;
605
606 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200607 for (i=0; i < ascii->length; i++)
608 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200609 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200610 if (ch > maxchar)
611 maxchar = ch;
612 }
613 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100614 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200615 CHECK(maxchar >= 128);
616 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100617 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200618 else
Victor Stinner68762572019-10-07 18:42:01 +0200619 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200620 }
Victor Stinner77faf692011-11-20 18:56:05 +0100621 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200622 CHECK(maxchar >= 0x100);
623 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100624 }
625 else {
Victor Stinner68762572019-10-07 18:42:01 +0200626 CHECK(maxchar >= 0x10000);
627 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100628 }
Victor Stinner68762572019-10-07 18:42:01 +0200629 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200630 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400631 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200632
633#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400634}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200635
Victor Stinner910337b2011-10-03 03:20:16 +0200636
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100637static PyObject*
638unicode_result_wchar(PyObject *unicode)
639{
640#ifndef Py_DEBUG
641 Py_ssize_t len;
642
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100643 len = _PyUnicode_WSTR_LENGTH(unicode);
644 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100645 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200646 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100647 }
648
649 if (len == 1) {
650 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100651 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100652 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
653 Py_DECREF(unicode);
654 return latin1_char;
655 }
656 }
657
658 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200659 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100660 return NULL;
661 }
662#else
Victor Stinneraa771272012-10-04 02:32:58 +0200663 assert(Py_REFCNT(unicode) == 1);
664
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100665 /* don't make the result ready in debug mode to ensure that the caller
666 makes the string ready before using it */
667 assert(_PyUnicode_CheckConsistency(unicode, 1));
668#endif
669 return unicode;
670}
671
672static PyObject*
673unicode_result_ready(PyObject *unicode)
674{
675 Py_ssize_t length;
676
677 length = PyUnicode_GET_LENGTH(unicode);
678 if (length == 0) {
679 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100680 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200681 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100682 }
683 return unicode_empty;
684 }
685
Victor Stinner607b1022020-05-05 18:50:30 +0200686#ifdef LATIN1_SINGLETONS
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100687 if (length == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300688 const void *data = PyUnicode_DATA(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +0200689 int kind = PyUnicode_KIND(unicode);
690 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100691 if (ch < 256) {
692 PyObject *latin1_char = unicode_latin1[ch];
693 if (latin1_char != NULL) {
694 if (unicode != latin1_char) {
695 Py_INCREF(latin1_char);
696 Py_DECREF(unicode);
697 }
698 return latin1_char;
699 }
700 else {
701 assert(_PyUnicode_CheckConsistency(unicode, 1));
702 Py_INCREF(unicode);
703 unicode_latin1[ch] = unicode;
704 return unicode;
705 }
706 }
707 }
Victor Stinner607b1022020-05-05 18:50:30 +0200708#endif
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100709
710 assert(_PyUnicode_CheckConsistency(unicode, 1));
711 return unicode;
712}
713
714static PyObject*
715unicode_result(PyObject *unicode)
716{
717 assert(_PyUnicode_CHECK(unicode));
718 if (PyUnicode_IS_READY(unicode))
719 return unicode_result_ready(unicode);
720 else
721 return unicode_result_wchar(unicode);
722}
723
Victor Stinnerc4b49542011-12-11 22:44:26 +0100724static PyObject*
725unicode_result_unchanged(PyObject *unicode)
726{
727 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500728 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100729 return NULL;
730 Py_INCREF(unicode);
731 return unicode;
732 }
733 else
734 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100735 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100736}
737
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200738/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
739 ASCII, Latin1, UTF-8, etc. */
740static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200741backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200742 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
743{
Victor Stinnerad771582015-10-09 12:38:53 +0200744 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200745 Py_UCS4 ch;
746 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300747 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200748
749 assert(PyUnicode_IS_READY(unicode));
750 kind = PyUnicode_KIND(unicode);
751 data = PyUnicode_DATA(unicode);
752
753 size = 0;
754 /* determine replacement size */
755 for (i = collstart; i < collend; ++i) {
756 Py_ssize_t incr;
757
758 ch = PyUnicode_READ(kind, data, i);
759 if (ch < 0x100)
760 incr = 2+2;
761 else if (ch < 0x10000)
762 incr = 2+4;
763 else {
764 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200765 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200766 }
767 if (size > PY_SSIZE_T_MAX - incr) {
768 PyErr_SetString(PyExc_OverflowError,
769 "encoded result is too long for a Python string");
770 return NULL;
771 }
772 size += incr;
773 }
774
Victor Stinnerad771582015-10-09 12:38:53 +0200775 str = _PyBytesWriter_Prepare(writer, str, size);
776 if (str == NULL)
777 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200778
779 /* generate replacement */
780 for (i = collstart; i < collend; ++i) {
781 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200782 *str++ = '\\';
783 if (ch >= 0x00010000) {
784 *str++ = 'U';
785 *str++ = Py_hexdigits[(ch>>28)&0xf];
786 *str++ = Py_hexdigits[(ch>>24)&0xf];
787 *str++ = Py_hexdigits[(ch>>20)&0xf];
788 *str++ = Py_hexdigits[(ch>>16)&0xf];
789 *str++ = Py_hexdigits[(ch>>12)&0xf];
790 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200791 }
Victor Stinner797485e2015-10-09 03:17:30 +0200792 else if (ch >= 0x100) {
793 *str++ = 'u';
794 *str++ = Py_hexdigits[(ch>>12)&0xf];
795 *str++ = Py_hexdigits[(ch>>8)&0xf];
796 }
797 else
798 *str++ = 'x';
799 *str++ = Py_hexdigits[(ch>>4)&0xf];
800 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200801 }
802 return str;
803}
804
805/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
806 ASCII, Latin1, UTF-8, etc. */
807static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200808xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200809 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
810{
Victor Stinnerad771582015-10-09 12:38:53 +0200811 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200812 Py_UCS4 ch;
813 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300814 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200815
816 assert(PyUnicode_IS_READY(unicode));
817 kind = PyUnicode_KIND(unicode);
818 data = PyUnicode_DATA(unicode);
819
820 size = 0;
821 /* determine replacement size */
822 for (i = collstart; i < collend; ++i) {
823 Py_ssize_t incr;
824
825 ch = PyUnicode_READ(kind, data, i);
826 if (ch < 10)
827 incr = 2+1+1;
828 else if (ch < 100)
829 incr = 2+2+1;
830 else if (ch < 1000)
831 incr = 2+3+1;
832 else if (ch < 10000)
833 incr = 2+4+1;
834 else if (ch < 100000)
835 incr = 2+5+1;
836 else if (ch < 1000000)
837 incr = 2+6+1;
838 else {
839 assert(ch <= MAX_UNICODE);
840 incr = 2+7+1;
841 }
842 if (size > PY_SSIZE_T_MAX - incr) {
843 PyErr_SetString(PyExc_OverflowError,
844 "encoded result is too long for a Python string");
845 return NULL;
846 }
847 size += incr;
848 }
849
Victor Stinnerad771582015-10-09 12:38:53 +0200850 str = _PyBytesWriter_Prepare(writer, str, size);
851 if (str == NULL)
852 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200853
854 /* generate replacement */
855 for (i = collstart; i < collend; ++i) {
856 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
857 }
858 return str;
859}
860
Thomas Wouters477c8d52006-05-27 19:21:47 +0000861/* --- Bloom Filters ----------------------------------------------------- */
862
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
866
867/* the linebreak mask is set up by Unicode_Init below */
868
Antoine Pitrouf068f942010-01-13 14:19:12 +0000869#if LONG_BIT >= 128
870#define BLOOM_WIDTH 128
871#elif LONG_BIT >= 64
872#define BLOOM_WIDTH 64
873#elif LONG_BIT >= 32
874#define BLOOM_WIDTH 32
875#else
876#error "LONG_BIT is smaller than 32"
877#endif
878
Thomas Wouters477c8d52006-05-27 19:21:47 +0000879#define BLOOM_MASK unsigned long
880
Serhiy Storchaka05997252013-01-26 12:14:02 +0200881static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000882
Antoine Pitrouf068f942010-01-13 14:19:12 +0000883#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884
Benjamin Peterson29060642009-01-31 22:14:21 +0000885#define BLOOM_LINEBREAK(ch) \
886 ((ch) < 128U ? ascii_linebreak[(ch)] : \
887 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000888
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700889static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300890make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000891{
Victor Stinnera85af502013-04-09 21:53:54 +0200892#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
893 do { \
894 TYPE *data = (TYPE *)PTR; \
895 TYPE *end = data + LEN; \
896 Py_UCS4 ch; \
897 for (; data != end; data++) { \
898 ch = *data; \
899 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
900 } \
901 break; \
902 } while (0)
903
Thomas Wouters477c8d52006-05-27 19:21:47 +0000904 /* calculate simple bloom-style bitmask for a given unicode string */
905
Antoine Pitrouf068f942010-01-13 14:19:12 +0000906 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000907
908 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200909 switch (kind) {
910 case PyUnicode_1BYTE_KIND:
911 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
912 break;
913 case PyUnicode_2BYTE_KIND:
914 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
915 break;
916 case PyUnicode_4BYTE_KIND:
917 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
918 break;
919 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700920 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200921 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000922 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200923
924#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000925}
926
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300927static int
928ensure_unicode(PyObject *obj)
929{
930 if (!PyUnicode_Check(obj)) {
931 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200932 "must be str, not %.100s",
933 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300934 return -1;
935 }
936 return PyUnicode_READY(obj);
937}
938
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200939/* Compilation of templated routines */
940
941#include "stringlib/asciilib.h"
942#include "stringlib/fastsearch.h"
943#include "stringlib/partition.h"
944#include "stringlib/split.h"
945#include "stringlib/count.h"
946#include "stringlib/find.h"
947#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200948#include "stringlib/undef.h"
949
950#include "stringlib/ucs1lib.h"
951#include "stringlib/fastsearch.h"
952#include "stringlib/partition.h"
953#include "stringlib/split.h"
954#include "stringlib/count.h"
955#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300956#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200957#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200958#include "stringlib/undef.h"
959
960#include "stringlib/ucs2lib.h"
961#include "stringlib/fastsearch.h"
962#include "stringlib/partition.h"
963#include "stringlib/split.h"
964#include "stringlib/count.h"
965#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300966#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200967#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200968#include "stringlib/undef.h"
969
970#include "stringlib/ucs4lib.h"
971#include "stringlib/fastsearch.h"
972#include "stringlib/partition.h"
973#include "stringlib/split.h"
974#include "stringlib/count.h"
975#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300976#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200977#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200978#include "stringlib/undef.h"
979
Inada Naoki2c4928d2020-06-17 20:09:44 +0900980_Py_COMP_DIAG_PUSH
981_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200982#include "stringlib/unicodedefs.h"
983#include "stringlib/fastsearch.h"
984#include "stringlib/count.h"
985#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100986#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900987_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200988
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989/* --- Unicode Object ----------------------------------------------------- */
990
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700991static inline Py_ssize_t
992findchar(const void *s, int kind,
993 Py_ssize_t size, Py_UCS4 ch,
994 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200996 switch (kind) {
997 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200998 if ((Py_UCS1) ch != ch)
999 return -1;
1000 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001001 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001004 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001005 if ((Py_UCS2) ch != ch)
1006 return -1;
1007 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001008 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001009 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001010 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001011 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001012 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001013 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001014 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001015 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001016 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001017 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001019}
1020
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Only the characters from index `old_length` up to the current length are
   overwritten; nothing is done if the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind value doubles as the size in bytes of one character */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1039
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040static PyObject*
1041resize_compact(PyObject *unicode, Py_ssize_t length)
1042{
1043 Py_ssize_t char_size;
1044 Py_ssize_t struct_size;
1045 Py_ssize_t new_size;
1046 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001047 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001048#ifdef Py_DEBUG
1049 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1050#endif
1051
Victor Stinner79891572012-05-03 13:43:07 +02001052 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001054 assert(PyUnicode_IS_COMPACT(unicode));
1055
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001056 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001057 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001058 struct_size = sizeof(PyASCIIObject);
1059 else
1060 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001061 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1064 PyErr_NoMemory();
1065 return NULL;
1066 }
1067 new_size = (struct_size + (length + 1) * char_size);
1068
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001069 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1070 PyObject_DEL(_PyUnicode_UTF8(unicode));
1071 _PyUnicode_UTF8(unicode) = NULL;
1072 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1073 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001074#ifdef Py_REF_DEBUG
1075 _Py_RefTotal--;
1076#endif
1077#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001078 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001079#endif
Victor Stinner84def372011-12-11 20:04:56 +01001080
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001081 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001082 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001083 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001084 PyErr_NoMemory();
1085 return NULL;
1086 }
Victor Stinner84def372011-12-11 20:04:56 +01001087 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001088 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001089
Victor Stinnerfe226c02011-10-03 03:52:20 +02001090 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001091 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001092 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001093 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001094 _PyUnicode_WSTR_LENGTH(unicode) = length;
1095 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001096 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1097 PyObject_DEL(_PyUnicode_WSTR(unicode));
1098 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001099 if (!PyUnicode_IS_ASCII(unicode))
1100 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001101 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001102#ifdef Py_DEBUG
1103 unicode_fill_invalid(unicode, old_length);
1104#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001105 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1106 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001107 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001108 return unicode;
1109}
1110
Alexander Belopolsky40018472011-02-26 01:02:56 +00001111static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001112resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113{
Victor Stinner95663112011-10-04 01:03:50 +02001114 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001115 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001117 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001118
Victor Stinnerfe226c02011-10-03 03:52:20 +02001119 if (PyUnicode_IS_READY(unicode)) {
1120 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001121 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001122 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001123#ifdef Py_DEBUG
1124 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1125#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001126
1127 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001128 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001129 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1130 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001131
1132 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1133 PyErr_NoMemory();
1134 return -1;
1135 }
1136 new_size = (length + 1) * char_size;
1137
Victor Stinner7a9105a2011-12-12 00:13:42 +01001138 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1139 {
1140 PyObject_DEL(_PyUnicode_UTF8(unicode));
1141 _PyUnicode_UTF8(unicode) = NULL;
1142 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1143 }
1144
Victor Stinnerfe226c02011-10-03 03:52:20 +02001145 data = (PyObject *)PyObject_REALLOC(data, new_size);
1146 if (data == NULL) {
1147 PyErr_NoMemory();
1148 return -1;
1149 }
1150 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001151 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001152 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001153 _PyUnicode_WSTR_LENGTH(unicode) = length;
1154 }
1155 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001156 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001157 _PyUnicode_UTF8_LENGTH(unicode) = length;
1158 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001159 _PyUnicode_LENGTH(unicode) = length;
1160 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001161#ifdef Py_DEBUG
1162 unicode_fill_invalid(unicode, old_length);
1163#endif
Victor Stinner95663112011-10-04 01:03:50 +02001164 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001165 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001166 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001167 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001168 }
Victor Stinner95663112011-10-04 01:03:50 +02001169 assert(_PyUnicode_WSTR(unicode) != NULL);
1170
1171 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001172 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001173 PyErr_NoMemory();
1174 return -1;
1175 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001176 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001177 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001178 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001179 if (!wstr) {
1180 PyErr_NoMemory();
1181 return -1;
1182 }
1183 _PyUnicode_WSTR(unicode) = wstr;
1184 _PyUnicode_WSTR(unicode)[length] = 0;
1185 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001186 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187 return 0;
1188}
1189
Victor Stinnerfe226c02011-10-03 03:52:20 +02001190static PyObject*
1191resize_copy(PyObject *unicode, Py_ssize_t length)
1192{
1193 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001194 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001196
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001197 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001198
1199 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1200 if (copy == NULL)
1201 return NULL;
1202
1203 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001204 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001205 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001206 }
1207 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001208 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001209
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001210 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001211 if (w == NULL)
1212 return NULL;
1213 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1214 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001215 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001216 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001217 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001218 }
1219}
1220
/* We allocate one extra Py_UNICODE element to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
1229
Alexander Belopolsky40018472011-02-26 01:02:56 +00001230static PyUnicodeObject *
1231_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001233 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235
Thomas Wouters477c8d52006-05-27 19:21:47 +00001236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001237 if (length == 0 && unicode_empty != NULL) {
1238 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001239 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 }
1241
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001242 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001243 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001244 return (PyUnicodeObject *)PyErr_NoMemory();
1245 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246 if (length < 0) {
1247 PyErr_SetString(PyExc_SystemError,
1248 "Negative size passed to _PyUnicode_New");
1249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 }
1251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1253 if (unicode == NULL)
1254 return NULL;
1255 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001256
1257 _PyUnicode_WSTR_LENGTH(unicode) = length;
1258 _PyUnicode_HASH(unicode) = -1;
1259 _PyUnicode_STATE(unicode).interned = 0;
1260 _PyUnicode_STATE(unicode).kind = 0;
1261 _PyUnicode_STATE(unicode).compact = 0;
1262 _PyUnicode_STATE(unicode).ready = 0;
1263 _PyUnicode_STATE(unicode).ascii = 0;
1264 _PyUnicode_DATA_ANY(unicode) = NULL;
1265 _PyUnicode_LENGTH(unicode) = 0;
1266 _PyUnicode_UTF8(unicode) = NULL;
1267 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1270 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001271 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001273 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275
Jeremy Hyltond8082792003-09-16 19:41:39 +00001276 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001277 * the caller fails before initializing str -- unicode_resize()
1278 * reads str[0], and the Keep-Alive optimization can keep memory
1279 * allocated for str alive across a call to unicode_dealloc(unicode).
1280 * We don't want unicode_resize to read uninitialized memory in
1281 * that case.
1282 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001283 _PyUnicode_WSTR(unicode)[0] = 0;
1284 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001285
Victor Stinner7931d9a2011-11-04 00:22:48 +01001286 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287 return unicode;
1288}
1289
Victor Stinnerf42dc442011-10-02 23:33:16 +02001290static const char*
1291unicode_kind_name(PyObject *unicode)
1292{
Victor Stinner42dfd712011-10-03 14:41:45 +02001293 /* don't check consistency: unicode_kind_name() is called from
1294 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001295 if (!PyUnicode_IS_COMPACT(unicode))
1296 {
1297 if (!PyUnicode_IS_READY(unicode))
1298 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001299 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001300 {
1301 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001302 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001303 return "legacy ascii";
1304 else
1305 return "legacy latin1";
1306 case PyUnicode_2BYTE_KIND:
1307 return "legacy UCS2";
1308 case PyUnicode_4BYTE_KIND:
1309 return "legacy UCS4";
1310 default:
1311 return "<legacy invalid kind>";
1312 }
1313 }
1314 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001315 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001316 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001317 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001318 return "ascii";
1319 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001320 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001321 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001322 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001323 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001324 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001325 default:
1326 return "<invalid compact kind>";
1327 }
1328}
1329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001332const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001333 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001334 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001335}
1336
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001337const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001338 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 return _PyUnicode_COMPACT_DATA(unicode);
1340}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001341const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001342 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001343 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1345 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1346 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1347 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1348 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1349 return PyUnicode_DATA(unicode);
1350}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001351
1352void
1353_PyUnicode_Dump(PyObject *op)
1354{
1355 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001356 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1357 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001358 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001359
Victor Stinnera849a4b2011-10-03 12:12:11 +02001360 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001361 {
1362 if (ascii->state.ascii)
1363 data = (ascii + 1);
1364 else
1365 data = (compact + 1);
1366 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001367 else
1368 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001369 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001370
Victor Stinnera849a4b2011-10-03 12:12:11 +02001371 if (ascii->wstr == data)
1372 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001373 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001374
Victor Stinnera3b334d2011-10-03 13:53:37 +02001375 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001376 printf(" (%zu), ", compact->wstr_length);
1377 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001378 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001379 }
1380 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001381 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001382 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001383}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384#endif
1385
1386PyObject *
1387PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1388{
1389 PyObject *obj;
1390 PyCompactUnicodeObject *unicode;
1391 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001392 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001393 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 Py_ssize_t char_size;
1395 Py_ssize_t struct_size;
1396
1397 /* Optimization for empty strings */
1398 if (size == 0 && unicode_empty != NULL) {
1399 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001400 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 }
1402
Victor Stinner9e9d6892011-10-04 01:02:02 +02001403 is_ascii = 0;
1404 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 struct_size = sizeof(PyCompactUnicodeObject);
1406 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001407 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 char_size = 1;
1409 is_ascii = 1;
1410 struct_size = sizeof(PyASCIIObject);
1411 }
1412 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001413 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 char_size = 1;
1415 }
1416 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001417 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 char_size = 2;
1419 if (sizeof(wchar_t) == 2)
1420 is_sharing = 1;
1421 }
1422 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001423 if (maxchar > MAX_UNICODE) {
1424 PyErr_SetString(PyExc_SystemError,
1425 "invalid maximum character passed to PyUnicode_New");
1426 return NULL;
1427 }
Victor Stinner8f825062012-04-27 13:55:39 +02001428 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429 char_size = 4;
1430 if (sizeof(wchar_t) == 4)
1431 is_sharing = 1;
1432 }
1433
1434 /* Ensure we won't overflow the size. */
1435 if (size < 0) {
1436 PyErr_SetString(PyExc_SystemError,
1437 "Negative size passed to PyUnicode_New");
1438 return NULL;
1439 }
1440 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1441 return PyErr_NoMemory();
1442
1443 /* Duplicated allocation code from _PyObject_New() instead of a call to
1444 * PyObject_New() so we are able to allocate space for the object and
1445 * it's data buffer.
1446 */
1447 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001448 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001450 }
1451 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452
1453 unicode = (PyCompactUnicodeObject *)obj;
1454 if (is_ascii)
1455 data = ((PyASCIIObject*)obj) + 1;
1456 else
1457 data = unicode + 1;
1458 _PyUnicode_LENGTH(unicode) = size;
1459 _PyUnicode_HASH(unicode) = -1;
1460 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001461 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 _PyUnicode_STATE(unicode).compact = 1;
1463 _PyUnicode_STATE(unicode).ready = 1;
1464 _PyUnicode_STATE(unicode).ascii = is_ascii;
1465 if (is_ascii) {
1466 ((char*)data)[size] = 0;
1467 _PyUnicode_WSTR(unicode) = NULL;
1468 }
Victor Stinner8f825062012-04-27 13:55:39 +02001469 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 ((char*)data)[size] = 0;
1471 _PyUnicode_WSTR(unicode) = NULL;
1472 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001474 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 else {
1477 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001478 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001479 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001481 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 ((Py_UCS4*)data)[size] = 0;
1483 if (is_sharing) {
1484 _PyUnicode_WSTR_LENGTH(unicode) = size;
1485 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1486 }
1487 else {
1488 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1489 _PyUnicode_WSTR(unicode) = NULL;
1490 }
1491 }
Victor Stinner8f825062012-04-27 13:55:39 +02001492#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001493 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001494#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001495 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496 return obj;
1497}
1498
1499#if SIZEOF_WCHAR_T == 2
1500/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1501 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001502 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001503
1504 This function assumes that unicode can hold one more code point than wstr
1505 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001506static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001508 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001509{
1510 const wchar_t *iter;
1511 Py_UCS4 *ucs4_out;
1512
Victor Stinner910337b2011-10-03 03:20:16 +02001513 assert(unicode != NULL);
1514 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1516 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1517
1518 for (iter = begin; iter < end; ) {
1519 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1520 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001521 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1522 && (iter+1) < end
1523 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001524 {
Victor Stinner551ac952011-11-29 22:58:13 +01001525 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526 iter += 2;
1527 }
1528 else {
1529 *ucs4_out++ = *iter;
1530 iter++;
1531 }
1532 }
1533 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1534 _PyUnicode_GET_LENGTH(unicode)));
1535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536}
1537#endif
1538
Victor Stinnercd9950f2011-10-02 00:34:53 +02001539static int
Victor Stinner488fa492011-12-12 00:01:39 +01001540unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001541{
Victor Stinner488fa492011-12-12 00:01:39 +01001542 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001543 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001544 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001545 return -1;
1546 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001547 return 0;
1548}
1549
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550static int
1551_copy_characters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001555 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001556 const void *from_data;
1557 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558
Victor Stinneree4544c2012-05-09 22:24:08 +02001559 assert(0 <= how_many);
1560 assert(0 <= from_start);
1561 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001562 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001564 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565
Victor Stinnerd3f08822012-05-29 12:57:52 +02001566 assert(PyUnicode_Check(to));
1567 assert(PyUnicode_IS_READY(to));
1568 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1569
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001570 if (how_many == 0)
1571 return 0;
1572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001574 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001576 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577
Victor Stinnerf1852262012-06-16 16:38:26 +02001578#ifdef Py_DEBUG
1579 if (!check_maxchar
1580 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1581 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001582 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001583 Py_UCS4 ch;
1584 Py_ssize_t i;
1585 for (i=0; i < how_many; i++) {
1586 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1587 assert(ch <= to_maxchar);
1588 }
1589 }
1590#endif
1591
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001592 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001593 if (check_maxchar
1594 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1595 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001596 /* Writing Latin-1 characters into an ASCII string requires to
1597 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001598 Py_UCS4 max_char;
1599 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001600 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001601 if (max_char >= 128)
1602 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001603 }
Christian Heimesf051e432016-09-13 20:22:02 +02001604 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001605 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001606 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001608 else if (from_kind == PyUnicode_1BYTE_KIND
1609 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001610 {
1611 _PyUnicode_CONVERT_BYTES(
1612 Py_UCS1, Py_UCS2,
1613 PyUnicode_1BYTE_DATA(from) + from_start,
1614 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1615 PyUnicode_2BYTE_DATA(to) + to_start
1616 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001617 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001618 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001619 && to_kind == PyUnicode_4BYTE_KIND)
1620 {
1621 _PyUnicode_CONVERT_BYTES(
1622 Py_UCS1, Py_UCS4,
1623 PyUnicode_1BYTE_DATA(from) + from_start,
1624 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1625 PyUnicode_4BYTE_DATA(to) + to_start
1626 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001627 }
1628 else if (from_kind == PyUnicode_2BYTE_KIND
1629 && to_kind == PyUnicode_4BYTE_KIND)
1630 {
1631 _PyUnicode_CONVERT_BYTES(
1632 Py_UCS2, Py_UCS4,
1633 PyUnicode_2BYTE_DATA(from) + from_start,
1634 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1635 PyUnicode_4BYTE_DATA(to) + to_start
1636 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001637 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001638 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001639 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1640
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001641 if (!check_maxchar) {
1642 if (from_kind == PyUnicode_2BYTE_KIND
1643 && to_kind == PyUnicode_1BYTE_KIND)
1644 {
1645 _PyUnicode_CONVERT_BYTES(
1646 Py_UCS2, Py_UCS1,
1647 PyUnicode_2BYTE_DATA(from) + from_start,
1648 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1649 PyUnicode_1BYTE_DATA(to) + to_start
1650 );
1651 }
1652 else if (from_kind == PyUnicode_4BYTE_KIND
1653 && to_kind == PyUnicode_1BYTE_KIND)
1654 {
1655 _PyUnicode_CONVERT_BYTES(
1656 Py_UCS4, Py_UCS1,
1657 PyUnicode_4BYTE_DATA(from) + from_start,
1658 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1659 PyUnicode_1BYTE_DATA(to) + to_start
1660 );
1661 }
1662 else if (from_kind == PyUnicode_4BYTE_KIND
1663 && to_kind == PyUnicode_2BYTE_KIND)
1664 {
1665 _PyUnicode_CONVERT_BYTES(
1666 Py_UCS4, Py_UCS2,
1667 PyUnicode_4BYTE_DATA(from) + from_start,
1668 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1669 PyUnicode_2BYTE_DATA(to) + to_start
1670 );
1671 }
1672 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001673 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001674 }
1675 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001676 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001677 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001678 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001679 Py_ssize_t i;
1680
Victor Stinnera0702ab2011-09-29 14:14:38 +02001681 for (i=0; i < how_many; i++) {
1682 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001683 if (ch > to_maxchar)
1684 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001685 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1686 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001687 }
1688 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001689 return 0;
1690}
1691
Victor Stinnerd3f08822012-05-29 12:57:52 +02001692void
1693_PyUnicode_FastCopyCharacters(
1694 PyObject *to, Py_ssize_t to_start,
1695 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001696{
1697 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1698}
1699
1700Py_ssize_t
1701PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1702 PyObject *from, Py_ssize_t from_start,
1703 Py_ssize_t how_many)
1704{
1705 int err;
1706
1707 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1708 PyErr_BadInternalCall();
1709 return -1;
1710 }
1711
Benjamin Petersonbac79492012-01-14 13:34:47 -05001712 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001713 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001714 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001715 return -1;
1716
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001717 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001718 PyErr_SetString(PyExc_IndexError, "string index out of range");
1719 return -1;
1720 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001721 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001722 PyErr_SetString(PyExc_IndexError, "string index out of range");
1723 return -1;
1724 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001725 if (how_many < 0) {
1726 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1727 return -1;
1728 }
1729 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001730 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1731 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001732 "Cannot write %zi characters at %zi "
1733 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001734 how_many, to_start, PyUnicode_GET_LENGTH(to));
1735 return -1;
1736 }
1737
1738 if (how_many == 0)
1739 return 0;
1740
Victor Stinner488fa492011-12-12 00:01:39 +01001741 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001742 return -1;
1743
1744 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1745 if (err) {
1746 PyErr_Format(PyExc_SystemError,
1747 "Cannot copy %s characters "
1748 "into a string of %s characters",
1749 unicode_kind_name(from),
1750 unicode_kind_name(to));
1751 return -1;
1752 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001753 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754}
1755
Victor Stinner17222162011-09-28 22:15:37 +02001756/* Find the maximum code point and count the number of surrogate pairs so a
1757 correct string length can be computed before converting a string to UCS4.
1758 This function counts single surrogates as a character and not as a pair.
1759
1760 Return 0 on success, or -1 on error. */
1761static int
1762find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1763 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764{
1765 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001766 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767
Victor Stinnerc53be962011-10-02 21:33:54 +02001768 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 *num_surrogates = 0;
1770 *maxchar = 0;
1771
1772 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001774 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1775 && (iter+1) < end
1776 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1777 {
1778 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1779 ++(*num_surrogates);
1780 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 }
1782 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001784 {
1785 ch = *iter;
1786 iter++;
1787 }
1788 if (ch > *maxchar) {
1789 *maxchar = ch;
1790 if (*maxchar > MAX_UNICODE) {
1791 PyErr_Format(PyExc_ValueError,
1792 "character U+%x is not in range [U+0000; U+10ffff]",
1793 ch);
1794 return -1;
1795 }
1796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 }
1798 return 0;
1799}
1800
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001801int
1802_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803{
1804 wchar_t *end;
1805 Py_UCS4 maxchar = 0;
1806 Py_ssize_t num_surrogates;
1807#if SIZEOF_WCHAR_T == 2
1808 Py_ssize_t length_wo_surrogates;
1809#endif
1810
Georg Brandl7597add2011-10-05 16:36:47 +02001811 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001812 strings were created using _PyObject_New() and where no canonical
1813 representation (the str field) has been set yet aka strings
1814 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001815 assert(_PyUnicode_CHECK(unicode));
1816 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001818 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001819 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 /* Actually, it should neither be interned nor be anything else: */
1821 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001824 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001825 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827
1828 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001829 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1830 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 PyErr_NoMemory();
1832 return -1;
1833 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001834 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 _PyUnicode_WSTR(unicode), end,
1836 PyUnicode_1BYTE_DATA(unicode));
1837 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1838 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1839 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1840 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001841 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001842 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001843 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 }
1845 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001846 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001847 _PyUnicode_UTF8(unicode) = NULL;
1848 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 }
1850 PyObject_FREE(_PyUnicode_WSTR(unicode));
1851 _PyUnicode_WSTR(unicode) = NULL;
1852 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1853 }
1854 /* In this case we might have to convert down from 4-byte native
1855 wchar_t to 2-byte unicode. */
1856 else if (maxchar < 65536) {
1857 assert(num_surrogates == 0 &&
1858 "FindMaxCharAndNumSurrogatePairs() messed up");
1859
Victor Stinner506f5922011-09-28 22:34:18 +02001860#if SIZEOF_WCHAR_T == 2
1861 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001862 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001863 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1864 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1865 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001866 _PyUnicode_UTF8(unicode) = NULL;
1867 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001868#else
1869 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001870 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001871 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001872 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001873 PyErr_NoMemory();
1874 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 }
Victor Stinner506f5922011-09-28 22:34:18 +02001876 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1877 _PyUnicode_WSTR(unicode), end,
1878 PyUnicode_2BYTE_DATA(unicode));
1879 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1880 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1881 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001882 _PyUnicode_UTF8(unicode) = NULL;
1883 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001884 PyObject_FREE(_PyUnicode_WSTR(unicode));
1885 _PyUnicode_WSTR(unicode) = NULL;
1886 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1887#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 }
1889 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1890 else {
1891#if SIZEOF_WCHAR_T == 2
1892 /* in case the native representation is 2-bytes, we need to allocate a
1893 new normalized 4-byte version. */
1894 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001895 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1896 PyErr_NoMemory();
1897 return -1;
1898 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001899 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1900 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 PyErr_NoMemory();
1902 return -1;
1903 }
1904 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1905 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001906 _PyUnicode_UTF8(unicode) = NULL;
1907 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001908 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1909 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001910 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911 PyObject_FREE(_PyUnicode_WSTR(unicode));
1912 _PyUnicode_WSTR(unicode) = NULL;
1913 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1914#else
1915 assert(num_surrogates == 0);
1916
Victor Stinnerc3c74152011-10-02 20:39:55 +02001917 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001919 _PyUnicode_UTF8(unicode) = NULL;
1920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1922#endif
1923 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1924 }
1925 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001926 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927 return 0;
1928}
1929
Alexander Belopolsky40018472011-02-26 01:02:56 +00001930static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001931unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932{
Walter Dörwald16807132007-05-25 13:52:07 +00001933 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001934 case SSTATE_NOT_INTERNED:
1935 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001936
Benjamin Peterson29060642009-01-31 22:14:21 +00001937 case SSTATE_INTERNED_MORTAL:
1938 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001939 Py_SET_REFCNT(unicode, 3);
Victor Stinner607b1022020-05-05 18:50:30 +02001940#ifdef INTERNED_STRINGS
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001941 if (PyDict_DelItem(interned, unicode) != 0) {
1942 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1943 NULL);
1944 }
Victor Stinner607b1022020-05-05 18:50:30 +02001945#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001946 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001947
Benjamin Peterson29060642009-01-31 22:14:21 +00001948 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001949 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1950 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001951
Benjamin Peterson29060642009-01-31 22:14:21 +00001952 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001953 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001954 }
1955
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001956 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001958 }
1959 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001960 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001961 }
1962 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001963 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001965
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001966 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967}
1968
#ifdef Py_DEBUG
/* Debug-only helper.

   Return 1 if `unicode` is the shared empty string object or, when
   LATIN1_SINGLETONS is defined, the object stored in the unicode_latin1[]
   table for its single character.  Return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == unicode_empty) {
        return 1;
    }
#ifdef LATIN1_SINGLETONS
    PyASCIIObject *header = (PyASCIIObject *)unicode;

    /* Only one-character strings that are not in the wchar_t
       representation are looked up in the table. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }

    Py_UCS4 code = PyUnicode_READ_CHAR(unicode, 0);
    if (code < 256 && unicode_latin1[code] == unicode) {
        return 1;
    }
#endif
    return 0;
}
#endif
1988
Alexander Belopolsky40018472011-02-26 01:02:56 +00001989static int
Victor Stinner488fa492011-12-12 00:01:39 +01001990unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001991{
Victor Stinner488fa492011-12-12 00:01:39 +01001992 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001993 if (Py_REFCNT(unicode) != 1)
1994 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001995 if (_PyUnicode_HASH(unicode) != -1)
1996 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001997 if (PyUnicode_CHECK_INTERNED(unicode))
1998 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001999 if (!PyUnicode_CheckExact(unicode))
2000 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002001#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002002 /* singleton refcount is greater than 1 */
2003 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002004#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002005 return 1;
2006}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002007
Victor Stinnerfe226c02011-10-03 03:52:20 +02002008static int
2009unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2010{
2011 PyObject *unicode;
2012 Py_ssize_t old_length;
2013
2014 assert(p_unicode != NULL);
2015 unicode = *p_unicode;
2016
2017 assert(unicode != NULL);
2018 assert(PyUnicode_Check(unicode));
2019 assert(0 <= length);
2020
Victor Stinner910337b2011-10-03 03:20:16 +02002021 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002022 old_length = PyUnicode_WSTR_LENGTH(unicode);
2023 else
2024 old_length = PyUnicode_GET_LENGTH(unicode);
2025 if (old_length == length)
2026 return 0;
2027
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002028 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002029 _Py_INCREF_UNICODE_EMPTY();
2030 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002032 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002033 return 0;
2034 }
2035
Victor Stinner488fa492011-12-12 00:01:39 +01002036 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002037 PyObject *copy = resize_copy(unicode, length);
2038 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002039 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002040 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002041 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002042 }
2043
Victor Stinnerfe226c02011-10-03 03:52:20 +02002044 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002045 PyObject *new_unicode = resize_compact(unicode, length);
2046 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002047 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002048 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002049 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002050 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002051 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002052}
2053
Alexander Belopolsky40018472011-02-26 01:02:56 +00002054int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002055PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002056{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002057 PyObject *unicode;
2058 if (p_unicode == NULL) {
2059 PyErr_BadInternalCall();
2060 return -1;
2061 }
2062 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002063 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002064 {
2065 PyErr_BadInternalCall();
2066 return -1;
2067 }
2068 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002069}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002070
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002071/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002072
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002073 WARNING: The function doesn't copy the terminating null character and
2074 doesn't check the maximum character (may write a latin1 character in an
2075 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002076static void
2077unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2078 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002079{
2080 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002081 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002082 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002083
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002084 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002085 switch (kind) {
2086 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002087#ifdef Py_DEBUG
2088 if (PyUnicode_IS_ASCII(unicode)) {
2089 Py_UCS4 maxchar = ucs1lib_find_max_char(
2090 (const Py_UCS1*)str,
2091 (const Py_UCS1*)str + len);
2092 assert(maxchar < 128);
2093 }
2094#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002095 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002096 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002097 }
2098 case PyUnicode_2BYTE_KIND: {
2099 Py_UCS2 *start = (Py_UCS2 *)data + index;
2100 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002101
Victor Stinner184252a2012-06-16 02:57:41 +02002102 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002103 *ucs2 = (Py_UCS2)*str;
2104
2105 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002106 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002107 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002108 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002109 Py_UCS4 *start = (Py_UCS4 *)data + index;
2110 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002111
Victor Stinner184252a2012-06-16 02:57:41 +02002112 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002113 *ucs4 = (Py_UCS4)*str;
2114
2115 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002116 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002117 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002118 default:
2119 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002120 }
2121}
2122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123static PyObject*
2124get_latin1_char(unsigned char ch)
2125{
Victor Stinner607b1022020-05-05 18:50:30 +02002126 PyObject *unicode;
2127
2128#ifdef LATIN1_SINGLETONS
2129 unicode = unicode_latin1[ch];
2130 if (unicode) {
2131 Py_INCREF(unicode);
2132 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133 }
Victor Stinner607b1022020-05-05 18:50:30 +02002134#endif
2135
2136 unicode = PyUnicode_New(1, ch);
2137 if (!unicode) {
2138 return NULL;
2139 }
2140
2141 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2142 assert(_PyUnicode_CheckConsistency(unicode, 1));
2143
2144#ifdef LATIN1_SINGLETONS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 Py_INCREF(unicode);
Victor Stinner607b1022020-05-05 18:50:30 +02002146 unicode_latin1[ch] = unicode;
2147#endif
Victor Stinnera464fc12011-10-02 20:39:30 +02002148 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149}
2150
Victor Stinner985a82a2014-01-03 12:53:47 +01002151static PyObject*
2152unicode_char(Py_UCS4 ch)
2153{
2154 PyObject *unicode;
2155
2156 assert(ch <= MAX_UNICODE);
2157
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002158 if (ch < 256)
2159 return get_latin1_char(ch);
2160
Victor Stinner985a82a2014-01-03 12:53:47 +01002161 unicode = PyUnicode_New(1, ch);
2162 if (unicode == NULL)
2163 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002164
2165 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2166 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002167 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002168 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002169 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2170 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2171 }
2172 assert(_PyUnicode_CheckConsistency(unicode, 1));
2173 return unicode;
2174}
2175
Alexander Belopolsky40018472011-02-26 01:02:56 +00002176PyObject *
2177PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002179 if (u == NULL)
2180 return (PyObject*)_PyUnicode_New(size);
2181
2182 if (size < 0) {
2183 PyErr_BadInternalCall();
2184 return NULL;
2185 }
2186
2187 return PyUnicode_FromWideChar(u, size);
2188}
2189
2190PyObject *
2191PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2192{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002193 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 Py_UCS4 maxchar = 0;
2195 Py_ssize_t num_surrogates;
2196
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002197 if (u == NULL && size != 0) {
2198 PyErr_BadInternalCall();
2199 return NULL;
2200 }
2201
2202 if (size == -1) {
2203 size = wcslen(u);
2204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002206 /* If the Unicode data is known at construction time, we can apply
2207 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002210 if (size == 0)
2211 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 /* Single character Unicode objects in the Latin-1 range are
2214 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002215 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 return get_latin1_char((unsigned char)*u);
2217
2218 /* If not empty and not single character, copy the Unicode data
2219 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002220 if (find_maxchar_surrogates(u, u + size,
2221 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 return NULL;
2223
Victor Stinner8faf8212011-12-08 22:14:11 +01002224 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002225 if (!unicode)
2226 return NULL;
2227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 switch (PyUnicode_KIND(unicode)) {
2229 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002230 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2232 break;
2233 case PyUnicode_2BYTE_KIND:
2234#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002235 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002236#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002237 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2239#endif
2240 break;
2241 case PyUnicode_4BYTE_KIND:
2242#if SIZEOF_WCHAR_T == 2
2243 /* This is the only case which has to process surrogates, thus
2244 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002245 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246#else
2247 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002248 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249#endif
2250 break;
2251 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002252 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002255 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256}
2257
Alexander Belopolsky40018472011-02-26 01:02:56 +00002258PyObject *
2259PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002260{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002261 if (size < 0) {
2262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002263 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002264 return NULL;
2265 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002266 if (u != NULL)
2267 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2268 else
2269 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002270}
2271
Alexander Belopolsky40018472011-02-26 01:02:56 +00002272PyObject *
2273PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002274{
2275 size_t size = strlen(u);
2276 if (size > PY_SSIZE_T_MAX) {
2277 PyErr_SetString(PyExc_OverflowError, "input too long");
2278 return NULL;
2279 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002280 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002281}
2282
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002283PyObject *
2284_PyUnicode_FromId(_Py_Identifier *id)
2285{
Victor Stinner297257f2020-06-02 14:39:45 +02002286 if (id->object) {
2287 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002288 }
Victor Stinner297257f2020-06-02 14:39:45 +02002289
2290 PyObject *obj;
2291 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2292 strlen(id->string),
2293 NULL, NULL);
2294 if (!obj) {
2295 return NULL;
2296 }
2297 PyUnicode_InternInPlace(&obj);
2298
2299 assert(!id->next);
2300 id->object = obj;
2301 id->next = static_strings;
2302 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002303 return id->object;
2304}
2305
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002306static void
2307unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002308{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002309 _Py_Identifier *tmp, *s = static_strings;
2310 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002311 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002312 tmp = s->next;
2313 s->next = NULL;
2314 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002315 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002316 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002317}
2318
Benjamin Peterson0df54292012-03-26 14:50:32 -04002319/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002320
Victor Stinnerd3f08822012-05-29 12:57:52 +02002321PyObject*
2322_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002323{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002324 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002325 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002326 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002327#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002328 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002329#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002330 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002331 }
Victor Stinner785938e2011-12-11 20:09:03 +01002332 unicode = PyUnicode_New(size, 127);
2333 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002334 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002335 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2336 assert(_PyUnicode_CheckConsistency(unicode, 1));
2337 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002338}
2339
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002340static Py_UCS4
2341kind_maxchar_limit(unsigned int kind)
2342{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002343 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002344 case PyUnicode_1BYTE_KIND:
2345 return 0x80;
2346 case PyUnicode_2BYTE_KIND:
2347 return 0x100;
2348 case PyUnicode_4BYTE_KIND:
2349 return 0x10000;
2350 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002351 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002352 }
2353}
2354
Victor Stinner702c7342011-10-05 13:50:52 +02002355static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002356_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002358 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002359 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002360
Serhiy Storchaka678db842013-01-26 12:16:36 +02002361 if (size == 0)
2362 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002363 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002364 if (size == 1)
2365 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002366
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002367 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002368 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 if (!res)
2370 return NULL;
2371 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002372 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002373 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002374}
2375
Victor Stinnere57b1c02011-09-28 22:20:48 +02002376static PyObject*
2377_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378{
2379 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002380 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002381
Serhiy Storchaka678db842013-01-26 12:16:36 +02002382 if (size == 0)
2383 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002384 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002385 if (size == 1)
2386 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002387
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002388 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002389 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 if (!res)
2391 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002392 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002394 else {
2395 _PyUnicode_CONVERT_BYTES(
2396 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2397 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002398 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399 return res;
2400}
2401
Victor Stinnere57b1c02011-09-28 22:20:48 +02002402static PyObject*
2403_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404{
2405 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002406 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002407
Serhiy Storchaka678db842013-01-26 12:16:36 +02002408 if (size == 0)
2409 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002410 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002411 if (size == 1)
2412 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002413
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002414 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002415 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002416 if (!res)
2417 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002418 if (max_char < 256)
2419 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2420 PyUnicode_1BYTE_DATA(res));
2421 else if (max_char < 0x10000)
2422 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2423 PyUnicode_2BYTE_DATA(res));
2424 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002425 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002426 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002427 return res;
2428}
2429
2430PyObject*
2431PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2432{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002433 if (size < 0) {
2434 PyErr_SetString(PyExc_ValueError, "size must be positive");
2435 return NULL;
2436 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002437 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002439 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002440 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002441 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002442 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002443 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002444 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002445 PyErr_SetString(PyExc_SystemError, "invalid kind");
2446 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448}
2449
Victor Stinnerece58de2012-04-23 23:36:38 +02002450Py_UCS4
2451_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2452{
2453 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002454 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002455
2456 assert(PyUnicode_IS_READY(unicode));
2457 assert(0 <= start);
2458 assert(end <= PyUnicode_GET_LENGTH(unicode));
2459 assert(start <= end);
2460
2461 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2462 return PyUnicode_MAX_CHAR_VALUE(unicode);
2463
2464 if (start == end)
2465 return 127;
2466
Victor Stinner94d558b2012-04-27 22:26:58 +02002467 if (PyUnicode_IS_ASCII(unicode))
2468 return 127;
2469
Victor Stinnerece58de2012-04-23 23:36:38 +02002470 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002471 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002472 endptr = (char *)startptr + end * kind;
2473 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002474 switch(kind) {
2475 case PyUnicode_1BYTE_KIND:
2476 return ucs1lib_find_max_char(startptr, endptr);
2477 case PyUnicode_2BYTE_KIND:
2478 return ucs2lib_find_max_char(startptr, endptr);
2479 case PyUnicode_4BYTE_KIND:
2480 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002481 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002482 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002483 }
2484}
2485
Victor Stinner25a4b292011-10-06 12:31:55 +02002486/* Ensure that a string uses the most efficient storage, if it is not the
2487 case: create a new string with of the right kind. Write NULL into *p_unicode
2488 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002489static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002490unicode_adjust_maxchar(PyObject **p_unicode)
2491{
2492 PyObject *unicode, *copy;
2493 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002494 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002495 unsigned int kind;
2496
2497 assert(p_unicode != NULL);
2498 unicode = *p_unicode;
2499 assert(PyUnicode_IS_READY(unicode));
2500 if (PyUnicode_IS_ASCII(unicode))
2501 return;
2502
2503 len = PyUnicode_GET_LENGTH(unicode);
2504 kind = PyUnicode_KIND(unicode);
2505 if (kind == PyUnicode_1BYTE_KIND) {
2506 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002507 max_char = ucs1lib_find_max_char(u, u + len);
2508 if (max_char >= 128)
2509 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002510 }
2511 else if (kind == PyUnicode_2BYTE_KIND) {
2512 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002513 max_char = ucs2lib_find_max_char(u, u + len);
2514 if (max_char >= 256)
2515 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002516 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002517 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002518 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002519 max_char = ucs4lib_find_max_char(u, u + len);
2520 if (max_char >= 0x10000)
2521 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002522 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002523 else
2524 Py_UNREACHABLE();
2525
Victor Stinner25a4b292011-10-06 12:31:55 +02002526 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002527 if (copy != NULL)
2528 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002529 Py_DECREF(unicode);
2530 *p_unicode = copy;
2531}
2532
Victor Stinner034f6cf2011-09-30 02:26:44 +02002533PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002534_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002535{
Victor Stinner87af4f22011-11-21 23:03:47 +01002536 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002537 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002538
Victor Stinner034f6cf2011-09-30 02:26:44 +02002539 if (!PyUnicode_Check(unicode)) {
2540 PyErr_BadInternalCall();
2541 return NULL;
2542 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002543 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002544 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002545
Victor Stinner87af4f22011-11-21 23:03:47 +01002546 length = PyUnicode_GET_LENGTH(unicode);
2547 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002548 if (!copy)
2549 return NULL;
2550 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2551
Christian Heimesf051e432016-09-13 20:22:02 +02002552 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002553 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002554 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002555 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002556}
2557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002558
Victor Stinnerbc603d12011-10-02 01:00:40 +02002559/* Widen Unicode objects to larger buffers. Don't write terminating null
2560 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002562static void*
2563unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002564{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002565 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002566
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002567 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002568 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002569 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002570 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002571 if (!result)
2572 return PyErr_NoMemory();
2573 assert(skind == PyUnicode_1BYTE_KIND);
2574 _PyUnicode_CONVERT_BYTES(
2575 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002576 (const Py_UCS1 *)data,
2577 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002578 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002579 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002580 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002581 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002582 if (!result)
2583 return PyErr_NoMemory();
2584 if (skind == PyUnicode_2BYTE_KIND) {
2585 _PyUnicode_CONVERT_BYTES(
2586 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002587 (const Py_UCS2 *)data,
2588 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002589 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002591 else {
2592 assert(skind == PyUnicode_1BYTE_KIND);
2593 _PyUnicode_CONVERT_BYTES(
2594 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002595 (const Py_UCS1 *)data,
2596 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002597 result);
2598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002599 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002600 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002601 Py_UNREACHABLE();
2602 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604}
2605
2606static Py_UCS4*
2607as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2608 int copy_null)
2609{
2610 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002611 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 Py_ssize_t len, targetlen;
2613 if (PyUnicode_READY(string) == -1)
2614 return NULL;
2615 kind = PyUnicode_KIND(string);
2616 data = PyUnicode_DATA(string);
2617 len = PyUnicode_GET_LENGTH(string);
2618 targetlen = len;
2619 if (copy_null)
2620 targetlen++;
2621 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002622 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 if (!target) {
2624 PyErr_NoMemory();
2625 return NULL;
2626 }
2627 }
2628 else {
2629 if (targetsize < targetlen) {
2630 PyErr_Format(PyExc_SystemError,
2631 "string is longer than the buffer");
2632 if (copy_null && 0 < targetsize)
2633 target[0] = 0;
2634 return NULL;
2635 }
2636 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002637 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002638 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002639 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002641 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002642 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002643 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2644 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002645 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002646 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002647 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002648 else {
2649 Py_UNREACHABLE();
2650 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651 if (copy_null)
2652 target[len] = 0;
2653 return target;
2654}
2655
2656Py_UCS4*
2657PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2658 int copy_null)
2659{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002660 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002661 PyErr_BadInternalCall();
2662 return NULL;
2663 }
2664 return as_ucs4(string, target, targetsize, copy_null);
2665}
2666
2667Py_UCS4*
2668PyUnicode_AsUCS4Copy(PyObject *string)
2669{
2670 return as_ucs4(string, NULL, 0, 1);
2671}
2672
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2676#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002677
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002678static int
2679unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2680 Py_ssize_t width, Py_ssize_t precision)
2681{
2682 Py_ssize_t length, fill, arglen;
2683 Py_UCS4 maxchar;
2684
2685 if (PyUnicode_READY(str) == -1)
2686 return -1;
2687
2688 length = PyUnicode_GET_LENGTH(str);
2689 if ((precision == -1 || precision >= length)
2690 && width <= length)
2691 return _PyUnicodeWriter_WriteStr(writer, str);
2692
2693 if (precision != -1)
2694 length = Py_MIN(precision, length);
2695
2696 arglen = Py_MAX(length, width);
2697 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2698 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2699 else
2700 maxchar = writer->maxchar;
2701
2702 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2703 return -1;
2704
2705 if (width > length) {
2706 fill = width - length;
2707 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2708 return -1;
2709 writer->pos += fill;
2710 }
2711
2712 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2713 str, 0, length);
2714 writer->pos += length;
2715 return 0;
2716}
2717
2718static int
Victor Stinner998b8062018-09-12 00:23:25 +02002719unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002720 Py_ssize_t width, Py_ssize_t precision)
2721{
2722 /* UTF-8 */
2723 Py_ssize_t length;
2724 PyObject *unicode;
2725 int res;
2726
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002727 if (precision == -1) {
2728 length = strlen(str);
2729 }
2730 else {
2731 length = 0;
2732 while (length < precision && str[length]) {
2733 length++;
2734 }
2735 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002736 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2737 if (unicode == NULL)
2738 return -1;
2739
2740 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2741 Py_DECREF(unicode);
2742 return res;
2743}
2744
Victor Stinner96865452011-03-01 23:44:09 +00002745static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002746unicode_fromformat_arg(_PyUnicodeWriter *writer,
2747 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002748{
Victor Stinnere215d962012-10-06 23:03:36 +02002749 const char *p;
2750 Py_ssize_t len;
2751 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002752 Py_ssize_t width;
2753 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002754 int longflag;
2755 int longlongflag;
2756 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002757 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002758
2759 p = f;
2760 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002761 zeropad = 0;
2762 if (*f == '0') {
2763 zeropad = 1;
2764 f++;
2765 }
Victor Stinner96865452011-03-01 23:44:09 +00002766
2767 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002768 width = -1;
2769 if (Py_ISDIGIT((unsigned)*f)) {
2770 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002771 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002772 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002773 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002774 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002775 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002776 return NULL;
2777 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002778 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002779 f++;
2780 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002781 }
2782 precision = -1;
2783 if (*f == '.') {
2784 f++;
2785 if (Py_ISDIGIT((unsigned)*f)) {
2786 precision = (*f - '0');
2787 f++;
2788 while (Py_ISDIGIT((unsigned)*f)) {
2789 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2790 PyErr_SetString(PyExc_ValueError,
2791 "precision too big");
2792 return NULL;
2793 }
2794 precision = (precision * 10) + (*f - '0');
2795 f++;
2796 }
2797 }
Victor Stinner96865452011-03-01 23:44:09 +00002798 if (*f == '%') {
2799 /* "%.3%s" => f points to "3" */
2800 f--;
2801 }
2802 }
2803 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002804 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002805 f--;
2806 }
Victor Stinner96865452011-03-01 23:44:09 +00002807
2808 /* Handle %ld, %lu, %lld and %llu. */
2809 longflag = 0;
2810 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002811 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002812 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002813 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002814 longflag = 1;
2815 ++f;
2816 }
Victor Stinner96865452011-03-01 23:44:09 +00002817 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002818 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002819 longlongflag = 1;
2820 f += 2;
2821 }
Victor Stinner96865452011-03-01 23:44:09 +00002822 }
2823 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002824 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002825 size_tflag = 1;
2826 ++f;
2827 }
Victor Stinnere215d962012-10-06 23:03:36 +02002828
2829 if (f[1] == '\0')
2830 writer->overallocate = 0;
2831
2832 switch (*f) {
2833 case 'c':
2834 {
2835 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002836 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002837 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002838 "character argument not in range(0x110000)");
2839 return NULL;
2840 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002841 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002842 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002843 break;
2844 }
2845
2846 case 'i':
2847 case 'd':
2848 case 'u':
2849 case 'x':
2850 {
2851 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002852 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002853 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002854
2855 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002856 if (longflag) {
2857 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2858 }
2859 else if (longlongflag) {
2860 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2861 }
2862 else if (size_tflag) {
2863 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2864 }
2865 else {
2866 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2867 }
Victor Stinnere215d962012-10-06 23:03:36 +02002868 }
2869 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002870 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002871 }
2872 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002873 if (longflag) {
2874 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2875 }
2876 else if (longlongflag) {
2877 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2878 }
2879 else if (size_tflag) {
2880 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2881 }
2882 else {
2883 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2884 }
Victor Stinnere215d962012-10-06 23:03:36 +02002885 }
2886 assert(len >= 0);
2887
Victor Stinnere215d962012-10-06 23:03:36 +02002888 if (precision < len)
2889 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002890
2891 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002892 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2893 return NULL;
2894
Victor Stinnere215d962012-10-06 23:03:36 +02002895 if (width > precision) {
2896 Py_UCS4 fillchar;
2897 fill = width - precision;
2898 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002899 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2900 return NULL;
2901 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002902 }
Victor Stinner15a11362012-10-06 23:48:20 +02002903 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002904 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002905 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2906 return NULL;
2907 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002908 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002909
Victor Stinner4a587072013-11-19 12:54:53 +01002910 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2911 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002912 break;
2913 }
2914
2915 case 'p':
2916 {
2917 char number[MAX_LONG_LONG_CHARS];
2918
2919 len = sprintf(number, "%p", va_arg(*vargs, void*));
2920 assert(len >= 0);
2921
2922 /* %p is ill-defined: ensure leading 0x. */
2923 if (number[1] == 'X')
2924 number[1] = 'x';
2925 else if (number[1] != 'x') {
2926 memmove(number + 2, number,
2927 strlen(number) + 1);
2928 number[0] = '0';
2929 number[1] = 'x';
2930 len += 2;
2931 }
2932
Victor Stinner4a587072013-11-19 12:54:53 +01002933 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002934 return NULL;
2935 break;
2936 }
2937
2938 case 's':
2939 {
2940 /* UTF-8 */
2941 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002942 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002943 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002944 break;
2945 }
2946
2947 case 'U':
2948 {
2949 PyObject *obj = va_arg(*vargs, PyObject *);
2950 assert(obj && _PyUnicode_CHECK(obj));
2951
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002952 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002953 return NULL;
2954 break;
2955 }
2956
2957 case 'V':
2958 {
2959 PyObject *obj = va_arg(*vargs, PyObject *);
2960 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002961 if (obj) {
2962 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002963 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002964 return NULL;
2965 }
2966 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002967 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002968 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002969 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002970 }
2971 break;
2972 }
2973
2974 case 'S':
2975 {
2976 PyObject *obj = va_arg(*vargs, PyObject *);
2977 PyObject *str;
2978 assert(obj);
2979 str = PyObject_Str(obj);
2980 if (!str)
2981 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002982 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002983 Py_DECREF(str);
2984 return NULL;
2985 }
2986 Py_DECREF(str);
2987 break;
2988 }
2989
2990 case 'R':
2991 {
2992 PyObject *obj = va_arg(*vargs, PyObject *);
2993 PyObject *repr;
2994 assert(obj);
2995 repr = PyObject_Repr(obj);
2996 if (!repr)
2997 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002998 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002999 Py_DECREF(repr);
3000 return NULL;
3001 }
3002 Py_DECREF(repr);
3003 break;
3004 }
3005
3006 case 'A':
3007 {
3008 PyObject *obj = va_arg(*vargs, PyObject *);
3009 PyObject *ascii;
3010 assert(obj);
3011 ascii = PyObject_ASCII(obj);
3012 if (!ascii)
3013 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003014 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003015 Py_DECREF(ascii);
3016 return NULL;
3017 }
3018 Py_DECREF(ascii);
3019 break;
3020 }
3021
3022 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003023 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003024 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003025 break;
3026
3027 default:
3028 /* if we stumble upon an unknown formatting code, copy the rest
3029 of the format string to the output string. (we cannot just
3030 skip the code, since there's no way to know what's in the
3031 argument list) */
3032 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003033 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003034 return NULL;
3035 f = p+len;
3036 return f;
3037 }
3038
3039 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003040 return f;
3041}
3042
Walter Dörwaldd2034312007-05-18 16:29:38 +00003043PyObject *
3044PyUnicode_FromFormatV(const char *format, va_list vargs)
3045{
Victor Stinnere215d962012-10-06 23:03:36 +02003046 va_list vargs2;
3047 const char *f;
3048 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003049
Victor Stinner8f674cc2013-04-17 23:02:17 +02003050 _PyUnicodeWriter_Init(&writer);
3051 writer.min_length = strlen(format) + 100;
3052 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003053
Benjamin Peterson0c212142016-09-20 20:39:33 -07003054 // Copy varags to be able to pass a reference to a subfunction.
3055 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003056
3057 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003058 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003059 f = unicode_fromformat_arg(&writer, f, &vargs2);
3060 if (f == NULL)
3061 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003063 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003064 const char *p;
3065 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003066
Victor Stinnere215d962012-10-06 23:03:36 +02003067 p = f;
3068 do
3069 {
3070 if ((unsigned char)*p > 127) {
3071 PyErr_Format(PyExc_ValueError,
3072 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3073 "string, got a non-ASCII byte: 0x%02x",
3074 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003075 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003076 }
3077 p++;
3078 }
3079 while (*p != '\0' && *p != '%');
3080 len = p - f;
3081
3082 if (*p == '\0')
3083 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003084
3085 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003086 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003087
3088 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003089 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003090 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003091 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003092 return _PyUnicodeWriter_Finish(&writer);
3093
3094 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003095 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003096 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003097 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003098}
3099
Walter Dörwaldd2034312007-05-18 16:29:38 +00003100PyObject *
3101PyUnicode_FromFormat(const char *format, ...)
3102{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003103 PyObject* ret;
3104 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003105
3106#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003107 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003108#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003109 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003110#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003111 ret = PyUnicode_FromFormatV(format, vargs);
3112 va_end(vargs);
3113 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003114}
3115
Serhiy Storchakac46db922018-10-23 22:58:24 +03003116static Py_ssize_t
3117unicode_get_widechar_size(PyObject *unicode)
3118{
3119 Py_ssize_t res;
3120
3121 assert(unicode != NULL);
3122 assert(_PyUnicode_CHECK(unicode));
3123
3124 if (_PyUnicode_WSTR(unicode) != NULL) {
3125 return PyUnicode_WSTR_LENGTH(unicode);
3126 }
3127 assert(PyUnicode_IS_READY(unicode));
3128
3129 res = _PyUnicode_LENGTH(unicode);
3130#if SIZEOF_WCHAR_T == 2
3131 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3132 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3133 const Py_UCS4 *end = s + res;
3134 for (; s < end; ++s) {
3135 if (*s > 0xFFFF) {
3136 ++res;
3137 }
3138 }
3139 }
3140#endif
3141 return res;
3142}
3143
3144static void
3145unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3146{
3147 const wchar_t *wstr;
3148
3149 assert(unicode != NULL);
3150 assert(_PyUnicode_CHECK(unicode));
3151
3152 wstr = _PyUnicode_WSTR(unicode);
3153 if (wstr != NULL) {
3154 memcpy(w, wstr, size * sizeof(wchar_t));
3155 return;
3156 }
3157 assert(PyUnicode_IS_READY(unicode));
3158
3159 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3160 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3161 for (; size--; ++s, ++w) {
3162 *w = *s;
3163 }
3164 }
3165 else {
3166#if SIZEOF_WCHAR_T == 4
3167 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3168 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3169 for (; size--; ++s, ++w) {
3170 *w = *s;
3171 }
3172#else
3173 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3174 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3175 for (; size--; ++s, ++w) {
3176 Py_UCS4 ch = *s;
3177 if (ch > 0xFFFF) {
3178 assert(ch <= MAX_UNICODE);
3179 /* encode surrogate pair in this case */
3180 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3181 if (!size--)
3182 break;
3183 *w = Py_UNICODE_LOW_SURROGATE(ch);
3184 }
3185 else {
3186 *w = ch;
3187 }
3188 }
3189#endif
3190 }
3191}
3192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003193#ifdef HAVE_WCHAR_H
3194
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003195/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003196
Victor Stinnerd88d9832011-09-06 02:00:05 +02003197 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003198 character) required to convert the unicode object. Ignore size argument.
3199
Victor Stinnerd88d9832011-09-06 02:00:05 +02003200 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003201 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003202 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003203Py_ssize_t
3204PyUnicode_AsWideChar(PyObject *unicode,
3205 wchar_t *w,
3206 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003207{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003208 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003209
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003210 if (unicode == NULL) {
3211 PyErr_BadInternalCall();
3212 return -1;
3213 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003214 if (!PyUnicode_Check(unicode)) {
3215 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003216 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003217 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003218
3219 res = unicode_get_widechar_size(unicode);
3220 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003221 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003222 }
3223
3224 if (size > res) {
3225 size = res + 1;
3226 }
3227 else {
3228 res = size;
3229 }
3230 unicode_copy_as_widechar(unicode, w, size);
3231 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003232}
3233
Victor Stinner137c34c2010-09-29 10:25:54 +00003234wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003235PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003236 Py_ssize_t *size)
3237{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003238 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003239 Py_ssize_t buflen;
3240
3241 if (unicode == NULL) {
3242 PyErr_BadInternalCall();
3243 return NULL;
3244 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003245 if (!PyUnicode_Check(unicode)) {
3246 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003247 return NULL;
3248 }
3249
Serhiy Storchakac46db922018-10-23 22:58:24 +03003250 buflen = unicode_get_widechar_size(unicode);
3251 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003252 if (buffer == NULL) {
3253 PyErr_NoMemory();
3254 return NULL;
3255 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003256 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3257 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003258 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003259 }
3260 else if (wcslen(buffer) != (size_t)buflen) {
3261 PyMem_FREE(buffer);
3262 PyErr_SetString(PyExc_ValueError,
3263 "embedded null character");
3264 return NULL;
3265 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003266 return buffer;
3267}
3268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003269#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270
Alexander Belopolsky40018472011-02-26 01:02:56 +00003271PyObject *
3272PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003273{
Victor Stinner8faf8212011-12-08 22:14:11 +01003274 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003275 PyErr_SetString(PyExc_ValueError,
3276 "chr() arg not in range(0x110000)");
3277 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003278 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003279
Victor Stinner985a82a2014-01-03 12:53:47 +01003280 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003281}
3282
Alexander Belopolsky40018472011-02-26 01:02:56 +00003283PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003284PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003286 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003287 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003288 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003289 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003290 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 Py_INCREF(obj);
3292 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003293 }
3294 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 /* For a Unicode subtype that's not a Unicode object,
3296 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003297 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003298 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003299 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003300 "Can't convert '%.100s' object to str implicitly",
3301 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003302 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003303}
3304
Alexander Belopolsky40018472011-02-26 01:02:56 +00003305PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003306PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003307 const char *encoding,
3308 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003309{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003310 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003311 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003312
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003314 PyErr_BadInternalCall();
3315 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003317
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003318 /* Decoding bytes objects is the most common case and should be fast */
3319 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003320 if (PyBytes_GET_SIZE(obj) == 0) {
3321 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3322 return NULL;
3323 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003324 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003325 }
3326 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003327 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3328 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003329 }
3330
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003331 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003332 PyErr_SetString(PyExc_TypeError,
3333 "decoding str is not supported");
3334 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003335 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003336
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003337 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3338 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3339 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003340 "decoding to str: need a bytes-like object, %.80s found",
3341 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003342 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003343 }
Tim Petersced69f82003-09-16 20:30:58 +00003344
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003345 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003346 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003347 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3348 return NULL;
3349 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003350 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003352
Serhiy Storchaka05997252013-01-26 12:14:02 +02003353 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003354 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003355 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356}
3357
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Each
   run of other characters that sits between two copied characters becomes
   a single '_'; such runs at the start or end of the name are dropped.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    char *const out_limit = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        const char c = *in;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Remember the separator; it is only emitted if more name
               characters follow. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3406
Alexander Belopolsky40018472011-02-26 01:02:56 +00003407PyObject *
3408PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003409 Py_ssize_t size,
3410 const char *encoding,
3411 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003412{
3413 PyObject *buffer = NULL, *unicode;
3414 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003415 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3416
Victor Stinner22eb6892019-06-26 00:51:05 +02003417 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3418 return NULL;
3419 }
3420
Victor Stinnered076ed2019-06-26 01:49:32 +02003421 if (size == 0) {
3422 _Py_RETURN_UNICODE_EMPTY();
3423 }
3424
Victor Stinner942889a2016-09-05 15:40:10 -07003425 if (encoding == NULL) {
3426 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3427 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003428
Fred Drakee4315f52000-05-09 19:53:39 +00003429 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003430 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3431 char *lower = buflower;
3432
3433 /* Fast paths */
3434 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3435 lower += 3;
3436 if (*lower == '_') {
3437 /* Match "utf8" and "utf_8" */
3438 lower++;
3439 }
3440
3441 if (lower[0] == '8' && lower[1] == 0) {
3442 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3443 }
3444 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3445 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3446 }
3447 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3448 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3449 }
3450 }
3451 else {
3452 if (strcmp(lower, "ascii") == 0
3453 || strcmp(lower, "us_ascii") == 0) {
3454 return PyUnicode_DecodeASCII(s, size, errors);
3455 }
Steve Dowercc16be82016-09-08 10:35:16 -07003456 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003457 else if (strcmp(lower, "mbcs") == 0) {
3458 return PyUnicode_DecodeMBCS(s, size, errors);
3459 }
3460 #endif
3461 else if (strcmp(lower, "latin1") == 0
3462 || strcmp(lower, "latin_1") == 0
3463 || strcmp(lower, "iso_8859_1") == 0
3464 || strcmp(lower, "iso8859_1") == 0) {
3465 return PyUnicode_DecodeLatin1(s, size, errors);
3466 }
3467 }
Victor Stinner37296e82010-06-10 13:36:23 +00003468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469
3470 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003471 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003472 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003473 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003474 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 if (buffer == NULL)
3476 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003477 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 if (unicode == NULL)
3479 goto onError;
3480 if (!PyUnicode_Check(unicode)) {
3481 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003482 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003483 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003484 encoding,
3485 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486 Py_DECREF(unicode);
3487 goto onError;
3488 }
3489 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003490 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003491
Benjamin Peterson29060642009-01-31 22:14:21 +00003492 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 Py_XDECREF(buffer);
3494 return NULL;
3495}
3496
Alexander Belopolsky40018472011-02-26 01:02:56 +00003497PyObject *
3498PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003499 const char *encoding,
3500 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003501{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003502 if (!PyUnicode_Check(unicode)) {
3503 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003504 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003505 }
3506
Serhiy Storchaka00939072016-10-27 21:05:49 +03003507 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3508 "PyUnicode_AsDecodedObject() is deprecated; "
3509 "use PyCodec_Decode() to decode from str", 1) < 0)
3510 return NULL;
3511
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003512 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003513 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003514
3515 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003516 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003517}
3518
Alexander Belopolsky40018472011-02-26 01:02:56 +00003519PyObject *
3520PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003521 const char *encoding,
3522 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003523{
3524 PyObject *v;
3525
3526 if (!PyUnicode_Check(unicode)) {
3527 PyErr_BadArgument();
3528 goto onError;
3529 }
3530
Serhiy Storchaka00939072016-10-27 21:05:49 +03003531 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3532 "PyUnicode_AsDecodedUnicode() is deprecated; "
3533 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3534 return NULL;
3535
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003536 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003537 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003538
3539 /* Decode via the codec registry */
3540 v = PyCodec_Decode(unicode, encoding, errors);
3541 if (v == NULL)
3542 goto onError;
3543 if (!PyUnicode_Check(v)) {
3544 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003545 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003546 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003547 encoding,
3548 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003549 Py_DECREF(v);
3550 goto onError;
3551 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003552 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003553
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003555 return NULL;
3556}
3557
Alexander Belopolsky40018472011-02-26 01:02:56 +00003558PyObject *
3559PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003560 Py_ssize_t size,
3561 const char *encoding,
3562 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563{
3564 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003565
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003566 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3570 Py_DECREF(unicode);
3571 return v;
3572}
3573
Alexander Belopolsky40018472011-02-26 01:02:56 +00003574PyObject *
3575PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003576 const char *encoding,
3577 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003578{
3579 PyObject *v;
3580
3581 if (!PyUnicode_Check(unicode)) {
3582 PyErr_BadArgument();
3583 goto onError;
3584 }
3585
Serhiy Storchaka00939072016-10-27 21:05:49 +03003586 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3587 "PyUnicode_AsEncodedObject() is deprecated; "
3588 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3589 "or PyCodec_Encode() for generic encoding", 1) < 0)
3590 return NULL;
3591
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003592 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003593 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003594
3595 /* Encode via the codec registry */
3596 v = PyCodec_Encode(unicode, encoding, errors);
3597 if (v == NULL)
3598 goto onError;
3599 return v;
3600
Benjamin Peterson29060642009-01-31 22:14:21 +00003601 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003602 return NULL;
3603}
3604
Victor Stinner1b579672011-12-17 05:47:23 +01003605
Victor Stinner2cba6b82018-01-10 22:46:15 +01003606static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003607unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003608 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003609{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003610 Py_ssize_t wlen;
3611 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3612 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003613 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003614 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003615
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003616 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003617 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003618 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003619 return NULL;
3620 }
3621
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003622 char *str;
3623 size_t error_pos;
3624 const char *reason;
3625 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003626 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003627 PyMem_Free(wstr);
3628
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003629 if (res != 0) {
3630 if (res == -2) {
3631 PyObject *exc;
3632 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3633 "locale", unicode,
3634 (Py_ssize_t)error_pos,
3635 (Py_ssize_t)(error_pos+1),
3636 reason);
3637 if (exc != NULL) {
3638 PyCodec_StrictErrors(exc);
3639 Py_DECREF(exc);
3640 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003641 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003642 else if (res == -3) {
3643 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3644 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003645 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003646 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003647 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003648 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003649 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003650
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003651 PyObject *bytes = PyBytes_FromString(str);
3652 PyMem_RawFree(str);
3653 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003654}
3655
Victor Stinnerad158722010-10-27 00:25:46 +00003656PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003657PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3658{
Victor Stinner709d23d2019-05-02 14:56:30 -04003659 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3660 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003661}
3662
3663PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003664PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003665{
Victor Stinner81a7be32020-04-14 15:14:01 +02003666 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003667 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3668 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003669 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003670 fs_codec->error_handler,
3671 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003672 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003673#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003674 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003675 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003676 fs_codec->encoding,
3677 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003678 }
Victor Stinnerad158722010-10-27 00:25:46 +00003679#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003680 else {
3681 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3682 machinery is not ready and so cannot be used:
3683 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003684 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3685 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003686 assert(filesystem_errors != NULL);
3687 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3688 assert(errors != _Py_ERROR_UNKNOWN);
3689#ifdef _Py_FORCE_UTF8_FS_ENCODING
3690 return unicode_encode_utf8(unicode, errors, NULL);
3691#else
3692 return unicode_encode_locale(unicode, errors, 0);
3693#endif
3694 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003695}
3696
Alexander Belopolsky40018472011-02-26 01:02:56 +00003697PyObject *
3698PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003699 const char *encoding,
3700 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701{
3702 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003703 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003704
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 if (!PyUnicode_Check(unicode)) {
3706 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 }
Fred Drakee4315f52000-05-09 19:53:39 +00003709
Victor Stinner22eb6892019-06-26 00:51:05 +02003710 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3711 return NULL;
3712 }
3713
Victor Stinner942889a2016-09-05 15:40:10 -07003714 if (encoding == NULL) {
3715 return _PyUnicode_AsUTF8String(unicode, errors);
3716 }
3717
Fred Drakee4315f52000-05-09 19:53:39 +00003718 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003719 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3720 char *lower = buflower;
3721
3722 /* Fast paths */
3723 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3724 lower += 3;
3725 if (*lower == '_') {
3726 /* Match "utf8" and "utf_8" */
3727 lower++;
3728 }
3729
3730 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003731 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003732 }
3733 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3734 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3735 }
3736 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3737 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3738 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003739 }
Victor Stinner942889a2016-09-05 15:40:10 -07003740 else {
3741 if (strcmp(lower, "ascii") == 0
3742 || strcmp(lower, "us_ascii") == 0) {
3743 return _PyUnicode_AsASCIIString(unicode, errors);
3744 }
Steve Dowercc16be82016-09-08 10:35:16 -07003745#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003746 else if (strcmp(lower, "mbcs") == 0) {
3747 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3748 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003749#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003750 else if (strcmp(lower, "latin1") == 0 ||
3751 strcmp(lower, "latin_1") == 0 ||
3752 strcmp(lower, "iso_8859_1") == 0 ||
3753 strcmp(lower, "iso8859_1") == 0) {
3754 return _PyUnicode_AsLatin1String(unicode, errors);
3755 }
3756 }
Victor Stinner37296e82010-06-10 13:36:23 +00003757 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758
3759 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003760 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003762 return NULL;
3763
3764 /* The normal path */
3765 if (PyBytes_Check(v))
3766 return v;
3767
3768 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003769 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003770 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003771 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003772
3773 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003774 "encoder %s returned bytearray instead of bytes; "
3775 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003776 encoding);
3777 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003778 Py_DECREF(v);
3779 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003780 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003781
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003782 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3783 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003784 Py_DECREF(v);
3785 return b;
3786 }
3787
3788 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003789 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003790 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003791 encoding,
3792 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003793 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003794 return NULL;
3795}
3796
Alexander Belopolsky40018472011-02-26 01:02:56 +00003797PyObject *
3798PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003799 const char *encoding,
3800 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003801{
3802 PyObject *v;
3803
3804 if (!PyUnicode_Check(unicode)) {
3805 PyErr_BadArgument();
3806 goto onError;
3807 }
3808
Serhiy Storchaka00939072016-10-27 21:05:49 +03003809 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3810 "PyUnicode_AsEncodedUnicode() is deprecated; "
3811 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3812 return NULL;
3813
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003814 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003815 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003816
3817 /* Encode via the codec registry */
3818 v = PyCodec_Encode(unicode, encoding, errors);
3819 if (v == NULL)
3820 goto onError;
3821 if (!PyUnicode_Check(v)) {
3822 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003823 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003824 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003825 encoding,
3826 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003827 Py_DECREF(v);
3828 goto onError;
3829 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003831
Benjamin Peterson29060642009-01-31 22:14:21 +00003832 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833 return NULL;
3834}
3835
Victor Stinner2cba6b82018-01-10 22:46:15 +01003836static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003837unicode_decode_locale(const char *str, Py_ssize_t len,
3838 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003839{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003840 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3841 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003842 return NULL;
3843 }
3844
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003845 wchar_t *wstr;
3846 size_t wlen;
3847 const char *reason;
3848 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003849 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003850 if (res != 0) {
3851 if (res == -2) {
3852 PyObject *exc;
3853 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3854 "locale", str, len,
3855 (Py_ssize_t)wlen,
3856 (Py_ssize_t)(wlen + 1),
3857 reason);
3858 if (exc != NULL) {
3859 PyCodec_StrictErrors(exc);
3860 Py_DECREF(exc);
3861 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003862 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003863 else if (res == -3) {
3864 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3865 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003866 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003867 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003868 }
Victor Stinner2f197072011-12-17 07:08:30 +01003869 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003870 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003871
3872 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3873 PyMem_RawFree(wstr);
3874 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003875}
3876
3877PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003878PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3879 const char *errors)
3880{
Victor Stinner709d23d2019-05-02 14:56:30 -04003881 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3882 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003883}
3884
3885PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003886PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003887{
3888 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003889 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3890 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003891}
3892
3893
3894PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003895PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003896 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003897 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3898}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003899
Christian Heimes5894ba72007-11-04 11:43:14 +00003900PyObject*
3901PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3902{
Victor Stinner81a7be32020-04-14 15:14:01 +02003903 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003904 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3905 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003906 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003907 fs_codec->error_handler,
3908 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04003909 NULL);
3910 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003911#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003912 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003913 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003914 fs_codec->encoding,
3915 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003916 }
Victor Stinnerad158722010-10-27 00:25:46 +00003917#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003918 else {
3919 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3920 machinery is not ready and so cannot be used:
3921 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003922 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3923 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003924 assert(filesystem_errors != NULL);
3925 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3926 assert(errors != _Py_ERROR_UNKNOWN);
3927#ifdef _Py_FORCE_UTF8_FS_ENCODING
3928 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3929#else
3930 return unicode_decode_locale(s, size, errors, 0);
3931#endif
3932 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003933}
3934
Martin v. Löwis011e8422009-05-05 04:43:17 +00003935
3936int
3937PyUnicode_FSConverter(PyObject* arg, void* addr)
3938{
Brett Cannonec6ce872016-09-06 15:50:29 -07003939 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003940 PyObject *output = NULL;
3941 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03003942 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003943 if (arg == NULL) {
3944 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003945 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003946 return 1;
3947 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003948 path = PyOS_FSPath(arg);
3949 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003950 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003951 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003952 if (PyBytes_Check(path)) {
3953 output = path;
3954 }
3955 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3956 output = PyUnicode_EncodeFSDefault(path);
3957 Py_DECREF(path);
3958 if (!output) {
3959 return 0;
3960 }
3961 assert(PyBytes_Check(output));
3962 }
3963
Victor Stinner0ea2a462010-04-30 00:22:08 +00003964 size = PyBytes_GET_SIZE(output);
3965 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003966 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003967 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003968 Py_DECREF(output);
3969 return 0;
3970 }
3971 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003972 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003973}
3974
3975
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003976int
3977PyUnicode_FSDecoder(PyObject* arg, void* addr)
3978{
Brett Cannona5711202016-09-06 19:36:01 -07003979 int is_buffer = 0;
3980 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003981 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003982 if (arg == NULL) {
3983 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003984 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003985 return 1;
3986 }
Brett Cannona5711202016-09-06 19:36:01 -07003987
3988 is_buffer = PyObject_CheckBuffer(arg);
3989 if (!is_buffer) {
3990 path = PyOS_FSPath(arg);
3991 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003992 return 0;
3993 }
Brett Cannona5711202016-09-06 19:36:01 -07003994 }
3995 else {
3996 path = arg;
3997 Py_INCREF(arg);
3998 }
3999
4000 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004001 output = path;
4002 }
4003 else if (PyBytes_Check(path) || is_buffer) {
4004 PyObject *path_bytes = NULL;
4005
4006 if (!PyBytes_Check(path) &&
4007 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004008 "path should be string, bytes, or os.PathLike, not %.200s",
4009 Py_TYPE(arg)->tp_name)) {
4010 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004011 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004012 }
4013 path_bytes = PyBytes_FromObject(path);
4014 Py_DECREF(path);
4015 if (!path_bytes) {
4016 return 0;
4017 }
4018 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4019 PyBytes_GET_SIZE(path_bytes));
4020 Py_DECREF(path_bytes);
4021 if (!output) {
4022 return 0;
4023 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004024 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004025 else {
4026 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004027 "path should be string, bytes, or os.PathLike, not %.200s",
4028 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004029 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004030 return 0;
4031 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004032 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004033 Py_DECREF(output);
4034 return 0;
4035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004037 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004038 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004039 Py_DECREF(output);
4040 return 0;
4041 }
4042 *(PyObject**)addr = output;
4043 return Py_CLEANUP_SUPPORTED;
4044}
4045
4046
Inada Naoki02a4d572020-02-27 13:48:59 +09004047static int unicode_fill_utf8(PyObject *unicode);
4048
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004049const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004051{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004052 if (!PyUnicode_Check(unicode)) {
4053 PyErr_BadArgument();
4054 return NULL;
4055 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004056 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004057 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004059 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004060 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 return NULL;
4062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 }
4064
4065 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004066 *psize = PyUnicode_UTF8_LENGTH(unicode);
4067 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004068}
4069
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004070const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004071PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004072{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004073 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4074}
4075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076Py_UNICODE *
4077PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4078{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004079 if (!PyUnicode_Check(unicode)) {
4080 PyErr_BadArgument();
4081 return NULL;
4082 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004083 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4084 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004085 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004086 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004087 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004088
Serhiy Storchakac46db922018-10-23 22:58:24 +03004089 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4090 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4091 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004094 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4095 if (w == NULL) {
4096 PyErr_NoMemory();
4097 return NULL;
4098 }
4099 unicode_copy_as_widechar(unicode, w, wlen + 1);
4100 _PyUnicode_WSTR(unicode) = w;
4101 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4102 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004103 }
4104 }
4105 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004106 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004107 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004108}
4109
Inada Naoki2c4928d2020-06-17 20:09:44 +09004110/* Deprecated APIs */
4111
4112_Py_COMP_DIAG_PUSH
4113_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4114
Alexander Belopolsky40018472011-02-26 01:02:56 +00004115Py_UNICODE *
4116PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119}
4120
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004121const Py_UNICODE *
4122_PyUnicode_AsUnicode(PyObject *unicode)
4123{
4124 Py_ssize_t size;
4125 const Py_UNICODE *wstr;
4126
4127 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4128 if (wstr && wcslen(wstr) != (size_t)size) {
4129 PyErr_SetString(PyExc_ValueError, "embedded null character");
4130 return NULL;
4131 }
4132 return wstr;
4133}
4134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004135
Alexander Belopolsky40018472011-02-26 01:02:56 +00004136Py_ssize_t
4137PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138{
4139 if (!PyUnicode_Check(unicode)) {
4140 PyErr_BadArgument();
4141 goto onError;
4142 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004143 if (_PyUnicode_WSTR(unicode) == NULL) {
4144 if (PyUnicode_AsUnicode(unicode) == NULL)
4145 goto onError;
4146 }
4147 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148
Benjamin Peterson29060642009-01-31 22:14:21 +00004149 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 return -1;
4151}
4152
Inada Naoki2c4928d2020-06-17 20:09:44 +09004153_Py_COMP_DIAG_POP
4154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004155Py_ssize_t
4156PyUnicode_GetLength(PyObject *unicode)
4157{
Victor Stinner07621332012-06-16 04:53:46 +02004158 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004159 PyErr_BadArgument();
4160 return -1;
4161 }
Victor Stinner07621332012-06-16 04:53:46 +02004162 if (PyUnicode_READY(unicode) == -1)
4163 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004164 return PyUnicode_GET_LENGTH(unicode);
4165}
4166
4167Py_UCS4
4168PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4169{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004170 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004171 int kind;
4172
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004173 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004174 PyErr_BadArgument();
4175 return (Py_UCS4)-1;
4176 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004177 if (PyUnicode_READY(unicode) == -1) {
4178 return (Py_UCS4)-1;
4179 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004180 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004181 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004182 return (Py_UCS4)-1;
4183 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004184 data = PyUnicode_DATA(unicode);
4185 kind = PyUnicode_KIND(unicode);
4186 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004187}
4188
4189int
4190PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4191{
4192 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004193 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004194 return -1;
4195 }
Victor Stinner488fa492011-12-12 00:01:39 +01004196 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004197 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004198 PyErr_SetString(PyExc_IndexError, "string index out of range");
4199 return -1;
4200 }
Victor Stinner488fa492011-12-12 00:01:39 +01004201 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004202 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004203 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4204 PyErr_SetString(PyExc_ValueError, "character out of range");
4205 return -1;
4206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004207 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4208 index, ch);
4209 return 0;
4210}
4211
/* Return the name of the default encoding.  The value is the constant
   string "utf-8"; the caller must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    return "utf-8";
}
4217
Victor Stinner554f3f02010-06-16 23:33:54 +00004218/* create or adjust a UnicodeDecodeError */
4219static void
4220make_decode_exception(PyObject **exceptionObject,
4221 const char *encoding,
4222 const char *input, Py_ssize_t length,
4223 Py_ssize_t startpos, Py_ssize_t endpos,
4224 const char *reason)
4225{
4226 if (*exceptionObject == NULL) {
4227 *exceptionObject = PyUnicodeDecodeError_Create(
4228 encoding, input, length, startpos, endpos, reason);
4229 }
4230 else {
4231 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4232 goto onError;
4233 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4234 goto onError;
4235 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4236 goto onError;
4237 }
4238 return;
4239
4240onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004241 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004242}
4243
Steve Dowercc16be82016-09-08 10:35:16 -07004244#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004245static int
4246widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4247{
4248 if (newsize > *size) {
4249 wchar_t *newbuf = *buf;
4250 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4251 PyErr_NoMemory();
4252 return -1;
4253 }
4254 *buf = newbuf;
4255 }
4256 *size = newsize;
4257 return 0;
4258}
4259
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260/* error handling callback helper:
4261 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004262 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263 and adjust various state variables.
4264 return 0 on success, -1 on error
4265*/
4266
Alexander Belopolsky40018472011-02-26 01:02:56 +00004267static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004268unicode_decode_call_errorhandler_wchar(
4269 const char *errors, PyObject **errorHandler,
4270 const char *encoding, const char *reason,
4271 const char **input, const char **inend, Py_ssize_t *startinpos,
4272 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004273 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004275 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004276
4277 PyObject *restuple = NULL;
4278 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004279 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004280 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004281 Py_ssize_t requiredsize;
4282 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004283 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004284 wchar_t *repwstr;
4285 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286
4287 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 *errorHandler = PyCodec_LookupError(errors);
4289 if (*errorHandler == NULL)
4290 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 }
4292
Victor Stinner554f3f02010-06-16 23:33:54 +00004293 make_decode_exception(exceptionObject,
4294 encoding,
4295 *input, *inend - *input,
4296 *startinpos, *endinpos,
4297 reason);
4298 if (*exceptionObject == NULL)
4299 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004300
Petr Viktorinffd97532020-02-11 17:46:57 +01004301 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004303 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004305 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004306 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004307 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004308 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004309 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004310
4311 /* Copy back the bytes variables, which might have been modified by the
4312 callback */
4313 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4314 if (!inputobj)
4315 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004316 *input = PyBytes_AS_STRING(inputobj);
4317 insize = PyBytes_GET_SIZE(inputobj);
4318 *inend = *input + insize;
4319 /* we can DECREF safely, as the exception has another reference,
4320 so the object won't go away. */
4321 Py_DECREF(inputobj);
4322
4323 if (newpos<0)
4324 newpos = insize+newpos;
4325 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004326 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004327 goto onError;
4328 }
4329
4330 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4331 if (repwstr == NULL)
4332 goto onError;
4333 /* need more space? (at least enough for what we
4334 have+the replacement+the rest of the string (starting
4335 at the new input position), so we won't have to check space
4336 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004337 requiredsize = *outpos;
4338 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4339 goto overflow;
4340 requiredsize += repwlen;
4341 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4342 goto overflow;
4343 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004344 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004345 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004346 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004347 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004348 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004349 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004350 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004351 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004352 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004353 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004354 *endinpos = newpos;
4355 *inptr = *input + newpos;
4356
4357 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004358 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004359 return 0;
4360
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004361 overflow:
4362 PyErr_SetString(PyExc_OverflowError,
4363 "decoded result is too long for a Python string");
4364
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004365 onError:
4366 Py_XDECREF(restuple);
4367 return -1;
4368}
Steve Dowercc16be82016-09-08 10:35:16 -07004369#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004370
4371static int
4372unicode_decode_call_errorhandler_writer(
4373 const char *errors, PyObject **errorHandler,
4374 const char *encoding, const char *reason,
4375 const char **input, const char **inend, Py_ssize_t *startinpos,
4376 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4377 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4378{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004379 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004380
4381 PyObject *restuple = NULL;
4382 PyObject *repunicode = NULL;
4383 Py_ssize_t insize;
4384 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004385 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004386 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004387 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004388 int need_to_grow = 0;
4389 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004390
4391 if (*errorHandler == NULL) {
4392 *errorHandler = PyCodec_LookupError(errors);
4393 if (*errorHandler == NULL)
4394 goto onError;
4395 }
4396
4397 make_decode_exception(exceptionObject,
4398 encoding,
4399 *input, *inend - *input,
4400 *startinpos, *endinpos,
4401 reason);
4402 if (*exceptionObject == NULL)
4403 goto onError;
4404
Petr Viktorinffd97532020-02-11 17:46:57 +01004405 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004406 if (restuple == NULL)
4407 goto onError;
4408 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004409 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004410 goto onError;
4411 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004412 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004413 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004414
4415 /* Copy back the bytes variables, which might have been modified by the
4416 callback */
4417 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4418 if (!inputobj)
4419 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004420 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004421 *input = PyBytes_AS_STRING(inputobj);
4422 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004423 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004424 /* we can DECREF safely, as the exception has another reference,
4425 so the object won't go away. */
4426 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004427
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004430 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004431 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004432 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434
Victor Stinner170ca6f2013-04-18 00:25:28 +02004435 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004436 if (replen > 1) {
4437 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004438 need_to_grow = 1;
4439 }
4440 new_inptr = *input + newpos;
4441 if (*inend - new_inptr > remain) {
4442 /* We don't know the decoding algorithm here so we make the worst
4443 assumption that one byte decodes to one unicode character.
4444 If unfortunately one byte could decode to more unicode characters,
4445 the decoder may write out-of-bound then. Is it possible for the
4446 algorithms using this function? */
4447 writer->min_length += *inend - new_inptr - remain;
4448 need_to_grow = 1;
4449 }
4450 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004451 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004452 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004453 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4454 goto onError;
4455 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004456 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004457 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004458
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004460 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004461
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004463 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004464 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465
Benjamin Peterson29060642009-01-31 22:14:21 +00004466 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004468 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004469}
4470
/* --- UTF-7 Codec -------------------------------------------------------- */
4472
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4474
/* Three simple macros defining base-64. */
4476
/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'.
   Note: the argument is evaluated several times, so it must not have
   side effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
4484
/* given that c is a base-64 character, what is its base-64 value?
   A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, anything else
   (i.e. '/') -> 63.  The argument is evaluated several times. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)
4492
/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off before the table lookup. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4497
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Bytes above 127 are never decoded directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4505
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is read-only.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4539
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * Only characters in the range 1..127 can qualify; NUL and anything
 * >= 128 are never encoded directly.  The argument c is evaluated
 * several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004551
Alexander Belopolsky40018472011-02-26 01:02:56 +00004552PyObject *
4553PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004554 Py_ssize_t size,
4555 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004557 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4558}
4559
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560/* The decoder. The only state we preserve is our read position,
4561 * i.e. how many characters we have consumed. So if we end in the
4562 * middle of a shift sequence we have to back off the read position
4563 * and the output to the beginning of the sequence, otherwise we lose
4564 * all the shift state (seen bits, number of bits seen, high
4565 * surrogate). */
4566
Alexander Belopolsky40018472011-02-26 01:02:56 +00004567PyObject *
4568PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004569 Py_ssize_t size,
4570 const char *errors,
4571 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004572{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004574 Py_ssize_t startinpos;
4575 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004576 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004577 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004578 const char *errmsg = "";
4579 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004580 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 unsigned int base64bits = 0;
4582 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004583 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584 PyObject *errorHandler = NULL;
4585 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004586
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004587 if (size == 0) {
4588 if (consumed)
4589 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004590 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004591 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004592
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004593 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004594 _PyUnicodeWriter_Init(&writer);
4595 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004596
4597 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 e = s + size;
4599
4600 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004601 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004603 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004604
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 if (inShift) { /* in a base-64 section */
4606 if (IS_BASE64(ch)) { /* consume a base-64 character */
4607 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4608 base64bits += 6;
4609 s++;
4610 if (base64bits >= 16) {
4611 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004612 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004613 base64bits -= 16;
4614 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004615 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 if (surrogate) {
4617 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004618 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4619 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004620 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004621 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004623 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 }
4625 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004626 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004627 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004629 }
4630 }
Victor Stinner551ac952011-11-29 22:58:13 +01004631 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004632 /* first surrogate */
4633 surrogate = outCh;
4634 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004635 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004636 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004637 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004638 }
4639 }
4640 }
4641 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004643 if (base64bits > 0) { /* left-over bits */
4644 if (base64bits >= 6) {
4645 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004646 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004647 errmsg = "partial character in shift sequence";
4648 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004650 else {
4651 /* Some bits remain; they should be zero */
4652 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004653 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004654 errmsg = "non-zero padding bits in shift sequence";
4655 goto utf7Error;
4656 }
4657 }
4658 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004659 if (surrogate && DECODE_DIRECT(ch)) {
4660 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4661 goto onError;
4662 }
4663 surrogate = 0;
4664 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004665 /* '-' is absorbed; other terminating
4666 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004667 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004669 }
4670 }
4671 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004672 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 s++; /* consume '+' */
4674 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004675 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004676 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004677 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004679 else if (s < e && !IS_BASE64(*s)) {
4680 s++;
4681 errmsg = "ill-formed sequence";
4682 goto utf7Error;
4683 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004684 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004685 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004686 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004687 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004689 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004690 }
4691 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004692 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004693 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004694 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004695 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004696 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 else {
4698 startinpos = s-starts;
4699 s++;
4700 errmsg = "unexpected special character";
4701 goto utf7Error;
4702 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004703 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004704utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004705 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004706 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004707 errors, &errorHandler,
4708 "utf7", errmsg,
4709 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004710 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004711 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004712 }
4713
Antoine Pitrou244651a2009-05-04 18:56:13 +00004714 /* end of string */
4715
4716 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4717 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004718 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004719 if (surrogate ||
4720 (base64bits >= 6) ||
4721 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004722 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004723 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004724 errors, &errorHandler,
4725 "utf7", "unterminated shift sequence",
4726 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004727 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004728 goto onError;
4729 if (s < e)
4730 goto restart;
4731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004732 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004733
4734 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004735 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004736 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004737 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004738 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004739 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004740 writer.kind, writer.data, shiftOutStart);
4741 Py_XDECREF(errorHandler);
4742 Py_XDECREF(exc);
4743 _PyUnicodeWriter_Dealloc(&writer);
4744 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004745 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004746 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004747 }
4748 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004749 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004750 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004751 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004752
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004753 Py_XDECREF(errorHandler);
4754 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004755 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004756
Benjamin Peterson29060642009-01-31 22:14:21 +00004757 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 Py_XDECREF(errorHandler);
4759 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004760 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004761 return NULL;
4762}
4763
4764
Alexander Belopolsky40018472011-02-26 01:02:56 +00004765PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004766_PyUnicode_EncodeUTF7(PyObject *str,
4767 int base64SetO,
4768 int base64WhiteSpace,
4769 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004770{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004771 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004772 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004773 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004774 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004775 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004776 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004777 unsigned int base64bits = 0;
4778 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004779 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004780 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004781
Benjamin Petersonbac79492012-01-14 13:34:47 -05004782 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004783 return NULL;
4784 kind = PyUnicode_KIND(str);
4785 data = PyUnicode_DATA(str);
4786 len = PyUnicode_GET_LENGTH(str);
4787
4788 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004789 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004790
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004791 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004792 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004793 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004794 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004795 if (v == NULL)
4796 return NULL;
4797
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004798 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004799 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004800 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004801
Antoine Pitrou244651a2009-05-04 18:56:13 +00004802 if (inShift) {
4803 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4804 /* shifting out */
4805 if (base64bits) { /* output remaining bits */
4806 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4807 base64buffer = 0;
4808 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004809 }
4810 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004811 /* Characters not in the BASE64 set implicitly unshift the sequence
4812 so no '-' is required, except if the character is itself a '-' */
4813 if (IS_BASE64(ch) || ch == '-') {
4814 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004815 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004816 *out++ = (char) ch;
4817 }
4818 else {
4819 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004820 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004821 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004822 else { /* not in a shift sequence */
4823 if (ch == '+') {
4824 *out++ = '+';
4825 *out++ = '-';
4826 }
4827 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4828 *out++ = (char) ch;
4829 }
4830 else {
4831 *out++ = '+';
4832 inShift = 1;
4833 goto encode_char;
4834 }
4835 }
4836 continue;
4837encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004838 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004839 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004840
Antoine Pitrou244651a2009-05-04 18:56:13 +00004841 /* code first surrogate */
4842 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004843 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004844 while (base64bits >= 6) {
4845 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4846 base64bits -= 6;
4847 }
4848 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004849 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004850 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004851 base64bits += 16;
4852 base64buffer = (base64buffer << 16) | ch;
4853 while (base64bits >= 6) {
4854 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4855 base64bits -= 6;
4856 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004857 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004858 if (base64bits)
4859 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4860 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004861 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004862 if (_PyBytes_Resize(&v, out - start) < 0)
4863 return NULL;
4864 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004865}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004866PyObject *
4867PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4868 Py_ssize_t size,
4869 int base64SetO,
4870 int base64WhiteSpace,
4871 const char *errors)
4872{
4873 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004874 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004875 if (tmp == NULL)
4876 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004877 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004878 base64WhiteSpace, errors);
4879 Py_DECREF(tmp);
4880 return result;
4881}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004882
Antoine Pitrou244651a2009-05-04 18:56:13 +00004883#undef IS_BASE64
4884#undef FROM_BASE64
4885#undef TO_BASE64
4886#undef DECODE_DIRECT
4887#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004888
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889/* --- UTF-8 Codec -------------------------------------------------------- */
4890
Alexander Belopolsky40018472011-02-26 01:02:56 +00004891PyObject *
4892PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004893 Py_ssize_t size,
4894 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895{
Walter Dörwald69652032004-09-07 20:24:22 +00004896 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4897}
4898
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899#include "stringlib/asciilib.h"
4900#include "stringlib/codecs.h"
4901#include "stringlib/undef.h"
4902
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004903#include "stringlib/ucs1lib.h"
4904#include "stringlib/codecs.h"
4905#include "stringlib/undef.h"
4906
4907#include "stringlib/ucs2lib.h"
4908#include "stringlib/codecs.h"
4909#include "stringlib/undef.h"
4910
4911#include "stringlib/ucs4lib.h"
4912#include "stringlib/codecs.h"
4913#include "stringlib/undef.h"
4914
Antoine Pitrouab868312009-01-10 15:40:25 +00004915/* Mask to quickly check whether a C 'long' contains a
4916 non-ASCII, UTF8-encoded char. */
4917#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004918# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004919#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004920# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004921#else
4922# error C 'long' size should be either 4 or 8!
4923#endif
4924
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004925static Py_ssize_t
4926ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004927{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004928 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004929 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004930
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004931 /*
4932 * Issue #17237: m68k is a bit different from most architectures in
4933 * that objects do not use "natural alignment" - for example, int and
4934 * long are only aligned at 2-byte boundaries. Therefore the assert()
4935 * won't work; also, tests have shown that skipping the "optimised
4936 * version" will even speed up m68k.
4937 */
4938#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004939#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004940 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4941 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004942 /* Fast path, see in STRINGLIB(utf8_decode) for
4943 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004944 /* Help allocation */
4945 const char *_p = p;
4946 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947 while (_p < aligned_end) {
4948 unsigned long value = *(const unsigned long *) _p;
4949 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004950 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004951 *((unsigned long *)q) = value;
4952 _p += SIZEOF_LONG;
4953 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004954 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955 p = _p;
4956 while (p < end) {
4957 if ((unsigned char)*p & 0x80)
4958 break;
4959 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004963#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004964#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004965 while (p < end) {
4966 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4967 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004968 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004969 /* Help allocation */
4970 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004971 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004972 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004973 if (value & ASCII_CHAR_MASK)
4974 break;
4975 _p += SIZEOF_LONG;
4976 }
4977 p = _p;
4978 if (_p == end)
4979 break;
4980 }
4981 if ((unsigned char)*p & 0x80)
4982 break;
4983 ++p;
4984 }
4985 memcpy(dest, start, p - start);
4986 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987}
Antoine Pitrouab868312009-01-10 15:40:25 +00004988
Victor Stinner709d23d2019-05-02 14:56:30 -04004989static PyObject *
4990unicode_decode_utf8(const char *s, Py_ssize_t size,
4991 _Py_error_handler error_handler, const char *errors,
4992 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004993{
Victor Stinner785938e2011-12-11 20:09:03 +01004994 if (size == 0) {
4995 if (consumed)
4996 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004997 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004998 }
4999
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5001 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01005002 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 *consumed = 1;
5004 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005005 }
5006
Inada Naoki770847a2019-06-24 12:30:24 +09005007 const char *starts = s;
5008 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005009
Inada Naoki770847a2019-06-24 12:30:24 +09005010 // fast path: try ASCII string.
5011 PyObject *u = PyUnicode_New(size, 127);
5012 if (u == NULL) {
5013 return NULL;
5014 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005015 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005016 if (s == end) {
5017 return u;
5018 }
5019
5020 // Use _PyUnicodeWriter after fast path is failed.
5021 _PyUnicodeWriter writer;
5022 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5023 writer.pos = s - starts;
5024
5025 Py_ssize_t startinpos, endinpos;
5026 const char *errmsg = "";
5027 PyObject *error_handler_obj = NULL;
5028 PyObject *exc = NULL;
5029
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005030 while (s < end) {
5031 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005032 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005033
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005034 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005035 if (PyUnicode_IS_ASCII(writer.buffer))
5036 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005037 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005038 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005039 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005040 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005041 } else {
5042 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005043 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005044 }
5045
5046 switch (ch) {
5047 case 0:
5048 if (s == end || consumed)
5049 goto End;
5050 errmsg = "unexpected end of data";
5051 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005052 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005053 break;
5054 case 1:
5055 errmsg = "invalid start byte";
5056 startinpos = s - starts;
5057 endinpos = startinpos + 1;
5058 break;
5059 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005060 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5061 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5062 {
5063 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005064 goto End;
5065 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005066 /* fall through */
5067 case 3:
5068 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005069 errmsg = "invalid continuation byte";
5070 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005071 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072 break;
5073 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005074 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005075 goto onError;
5076 continue;
5077 }
5078
Victor Stinner1d65d912015-10-05 13:43:50 +02005079 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005080 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005081
5082 switch (error_handler) {
5083 case _Py_ERROR_IGNORE:
5084 s += (endinpos - startinpos);
5085 break;
5086
5087 case _Py_ERROR_REPLACE:
5088 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5089 goto onError;
5090 s += (endinpos - startinpos);
5091 break;
5092
5093 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005094 {
5095 Py_ssize_t i;
5096
Victor Stinner1d65d912015-10-05 13:43:50 +02005097 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5098 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005099 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005100 ch = (Py_UCS4)(unsigned char)(starts[i]);
5101 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5102 ch + 0xdc00);
5103 writer.pos++;
5104 }
5105 s += (endinpos - startinpos);
5106 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005107 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005108
5109 default:
5110 if (unicode_decode_call_errorhandler_writer(
5111 errors, &error_handler_obj,
5112 "utf-8", errmsg,
5113 &starts, &end, &startinpos, &endinpos, &exc, &s,
5114 &writer))
5115 goto onError;
5116 }
Victor Stinner785938e2011-12-11 20:09:03 +01005117 }
5118
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005119End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005120 if (consumed)
5121 *consumed = s - starts;
5122
Victor Stinner1d65d912015-10-05 13:43:50 +02005123 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005124 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005125 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005126
5127onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005128 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005129 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005130 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005131 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005132}
5133
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005134
Victor Stinner709d23d2019-05-02 14:56:30 -04005135PyObject *
5136PyUnicode_DecodeUTF8Stateful(const char *s,
5137 Py_ssize_t size,
5138 const char *errors,
5139 Py_ssize_t *consumed)
5140{
5141 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5142}
5143
5144
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005145/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5146 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005147
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005148 On success, write a pointer to a newly allocated wide character string into
5149 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5150 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005151
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005152 On memory allocation failure, return -1.
5153
5154 On decoding error (if surrogateescape is zero), return -2. If wlen is
5155 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5156 is not NULL, write the decoding error message into *reason. */
5157int
5158_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005159 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005160{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005161 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005162 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005163 wchar_t *unicode;
5164 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005165
Victor Stinner3d4226a2018-08-29 22:21:32 +02005166 int surrogateescape = 0;
5167 int surrogatepass = 0;
5168 switch (errors)
5169 {
5170 case _Py_ERROR_STRICT:
5171 break;
5172 case _Py_ERROR_SURROGATEESCAPE:
5173 surrogateescape = 1;
5174 break;
5175 case _Py_ERROR_SURROGATEPASS:
5176 surrogatepass = 1;
5177 break;
5178 default:
5179 return -3;
5180 }
5181
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005182 /* Note: size will always be longer than the resulting Unicode
5183 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005184 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005185 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005186 }
5187
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005188 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005189 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005190 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005191 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005192
5193 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005194 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005195 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005196 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005197 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005198#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005199 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005200#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005201 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005202#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005203 if (ch > 0xFF) {
5204#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005205 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005206#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005207 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005208 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005209 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5210 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5211#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005212 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005213 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005214 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005215 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005216 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005217
5218 if (surrogateescape) {
5219 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5220 }
5221 else {
5222 /* Is it a valid three-byte code? */
5223 if (surrogatepass
5224 && (e - s) >= 3
5225 && (s[0] & 0xf0) == 0xe0
5226 && (s[1] & 0xc0) == 0x80
5227 && (s[2] & 0xc0) == 0x80)
5228 {
5229 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5230 s += 3;
5231 unicode[outpos++] = ch;
5232 }
5233 else {
5234 PyMem_RawFree(unicode );
5235 if (reason != NULL) {
5236 switch (ch) {
5237 case 0:
5238 *reason = "unexpected end of data";
5239 break;
5240 case 1:
5241 *reason = "invalid start byte";
5242 break;
5243 /* 2, 3, 4 */
5244 default:
5245 *reason = "invalid continuation byte";
5246 break;
5247 }
5248 }
5249 if (wlen != NULL) {
5250 *wlen = s - orig_s;
5251 }
5252 return -2;
5253 }
5254 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005255 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005256 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005257 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005258 if (wlen) {
5259 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005260 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005261 *wstr = unicode;
5262 return 0;
5263}
5264
Victor Stinner5f9cf232019-03-19 01:46:25 +01005265
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005266wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005267_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5268 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005269{
5270 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005271 int res = _Py_DecodeUTF8Ex(arg, arglen,
5272 &wstr, wlen,
5273 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005274 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005275 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5276 assert(res != -3);
5277 if (wlen) {
5278 *wlen = (size_t)res;
5279 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005280 return NULL;
5281 }
5282 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005283}
5284
Antoine Pitrouab868312009-01-10 15:40:25 +00005285
Victor Stinnere47e6982017-12-21 15:45:16 +01005286/* UTF-8 encoder using the surrogateescape error handler .
5287
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005288 On success, return 0 and write the newly allocated character string (use
5289 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005290
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005291 On encoding failure, return -2 and write the position of the invalid
5292 surrogate character into *error_pos (if error_pos is set) and the decoding
5293 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005294
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005295 On memory allocation failure, return -1. */
5296int
5297_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005298 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005299{
5300 const Py_ssize_t max_char_size = 4;
5301 Py_ssize_t len = wcslen(text);
5302
5303 assert(len >= 0);
5304
Victor Stinner3d4226a2018-08-29 22:21:32 +02005305 int surrogateescape = 0;
5306 int surrogatepass = 0;
5307 switch (errors)
5308 {
5309 case _Py_ERROR_STRICT:
5310 break;
5311 case _Py_ERROR_SURROGATEESCAPE:
5312 surrogateescape = 1;
5313 break;
5314 case _Py_ERROR_SURROGATEPASS:
5315 surrogatepass = 1;
5316 break;
5317 default:
5318 return -3;
5319 }
5320
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005321 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5322 return -1;
5323 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005324 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005325 if (raw_malloc) {
5326 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005327 }
5328 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005329 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005330 }
5331 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005332 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005333 }
5334
5335 char *p = bytes;
5336 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005337 for (i = 0; i < len; ) {
5338 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005339 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005340 i++;
5341#if Py_UNICODE_SIZE == 2
5342 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5343 && i < len
5344 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5345 {
5346 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5347 i++;
5348 }
5349#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005350
5351 if (ch < 0x80) {
5352 /* Encode ASCII */
5353 *p++ = (char) ch;
5354
5355 }
5356 else if (ch < 0x0800) {
5357 /* Encode Latin-1 */
5358 *p++ = (char)(0xc0 | (ch >> 6));
5359 *p++ = (char)(0x80 | (ch & 0x3f));
5360 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005361 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005362 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005363 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005364 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005365 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005366 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005367 if (reason != NULL) {
5368 *reason = "encoding error";
5369 }
5370 if (raw_malloc) {
5371 PyMem_RawFree(bytes);
5372 }
5373 else {
5374 PyMem_Free(bytes);
5375 }
5376 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005377 }
5378 *p++ = (char)(ch & 0xff);
5379 }
5380 else if (ch < 0x10000) {
5381 *p++ = (char)(0xe0 | (ch >> 12));
5382 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5383 *p++ = (char)(0x80 | (ch & 0x3f));
5384 }
5385 else { /* ch >= 0x10000 */
5386 assert(ch <= MAX_UNICODE);
5387 /* Encode UCS4 Unicode ordinals */
5388 *p++ = (char)(0xf0 | (ch >> 18));
5389 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5390 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5391 *p++ = (char)(0x80 | (ch & 0x3f));
5392 }
5393 }
5394 *p++ = '\0';
5395
5396 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005397 char *bytes2;
5398 if (raw_malloc) {
5399 bytes2 = PyMem_RawRealloc(bytes, final_size);
5400 }
5401 else {
5402 bytes2 = PyMem_Realloc(bytes, final_size);
5403 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005404 if (bytes2 == NULL) {
5405 if (error_pos != NULL) {
5406 *error_pos = (size_t)-1;
5407 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005408 if (raw_malloc) {
5409 PyMem_RawFree(bytes);
5410 }
5411 else {
5412 PyMem_Free(bytes);
5413 }
5414 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005415 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005416 *str = bytes2;
5417 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005418}
5419
5420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005421/* Primary internal function which creates utf8 encoded bytes objects.
5422
5423 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005424 and allocate exactly as much space needed at the end. Else allocate the
5425 maximum possible needed (4 result bytes per Unicode character), and return
5426 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005427*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005428static PyObject *
5429unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5430 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005432 if (!PyUnicode_Check(unicode)) {
5433 PyErr_BadArgument();
5434 return NULL;
5435 }
5436
5437 if (PyUnicode_READY(unicode) == -1)
5438 return NULL;
5439
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005440 if (PyUnicode_UTF8(unicode))
5441 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5442 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005443
Inada Naoki02a4d572020-02-27 13:48:59 +09005444 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005445 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005446 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5447
5448 _PyBytesWriter writer;
5449 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005450
Benjamin Petersonead6b532011-12-20 17:23:42 -06005451 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005452 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005453 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005454 case PyUnicode_1BYTE_KIND:
5455 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5456 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005457 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5458 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005459 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005460 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5461 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005462 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005463 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5464 break;
Tim Peters602f7402002-04-27 18:03:26 +00005465 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005466
5467 if (end == NULL) {
5468 _PyBytesWriter_Dealloc(&writer);
5469 return NULL;
5470 }
5471 return _PyBytesWriter_Finish(&writer, end);
5472}
5473
5474static int
5475unicode_fill_utf8(PyObject *unicode)
5476{
5477 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5478 assert(!PyUnicode_IS_ASCII(unicode));
5479
5480 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005481 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005482 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5483
5484 _PyBytesWriter writer;
5485 char *end;
5486
5487 switch (kind) {
5488 default:
5489 Py_UNREACHABLE();
5490 case PyUnicode_1BYTE_KIND:
5491 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5492 _Py_ERROR_STRICT, NULL);
5493 break;
5494 case PyUnicode_2BYTE_KIND:
5495 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5496 _Py_ERROR_STRICT, NULL);
5497 break;
5498 case PyUnicode_4BYTE_KIND:
5499 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5500 _Py_ERROR_STRICT, NULL);
5501 break;
5502 }
5503 if (end == NULL) {
5504 _PyBytesWriter_Dealloc(&writer);
5505 return -1;
5506 }
5507
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005508 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005509 PyBytes_AS_STRING(writer.buffer);
5510 Py_ssize_t len = end - start;
5511
5512 char *cache = PyObject_MALLOC(len + 1);
5513 if (cache == NULL) {
5514 _PyBytesWriter_Dealloc(&writer);
5515 PyErr_NoMemory();
5516 return -1;
5517 }
5518 _PyUnicode_UTF8(unicode) = cache;
5519 _PyUnicode_UTF8_LENGTH(unicode) = len;
5520 memcpy(cache, start, len);
5521 cache[len] = '\0';
5522 _PyBytesWriter_Dealloc(&writer);
5523 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524}
5525
Alexander Belopolsky40018472011-02-26 01:02:56 +00005526PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005527_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5528{
5529 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5530}
5531
5532
5533PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005534PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5535 Py_ssize_t size,
5536 const char *errors)
5537{
5538 PyObject *v, *unicode;
5539
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005540 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005541 if (unicode == NULL)
5542 return NULL;
5543 v = _PyUnicode_AsUTF8String(unicode, errors);
5544 Py_DECREF(unicode);
5545 return v;
5546}
5547
5548PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005549PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005551 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552}
5553
Walter Dörwald41980ca2007-08-16 21:55:45 +00005554/* --- UTF-32 Codec ------------------------------------------------------- */
5555
5556PyObject *
5557PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 Py_ssize_t size,
5559 const char *errors,
5560 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005561{
5562 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5563}
5564
5565PyObject *
5566PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 Py_ssize_t size,
5568 const char *errors,
5569 int *byteorder,
5570 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005571{
5572 const char *starts = s;
5573 Py_ssize_t startinpos;
5574 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005575 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005576 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005577 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005578 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005579 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005580 PyObject *errorHandler = NULL;
5581 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005582
Andy Lestere6be9b52020-02-11 20:28:35 -06005583 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005584 e = q + size;
5585
5586 if (byteorder)
5587 bo = *byteorder;
5588
5589 /* Check for BOM marks (U+FEFF) in the input and adjust current
5590 byte order setting accordingly. In native mode, the leading BOM
5591 mark is skipped, in all other modes, it is copied to the output
5592 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005593 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005594 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005595 if (bom == 0x0000FEFF) {
5596 bo = -1;
5597 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005599 else if (bom == 0xFFFE0000) {
5600 bo = 1;
5601 q += 4;
5602 }
5603 if (byteorder)
5604 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005605 }
5606
Victor Stinnere64322e2012-10-30 23:12:47 +01005607 if (q == e) {
5608 if (consumed)
5609 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005610 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005611 }
5612
Victor Stinnere64322e2012-10-30 23:12:47 +01005613#ifdef WORDS_BIGENDIAN
5614 le = bo < 0;
5615#else
5616 le = bo <= 0;
5617#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005618 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005619
Victor Stinner8f674cc2013-04-17 23:02:17 +02005620 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005621 writer.min_length = (e - q + 3) / 4;
5622 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005624
Victor Stinnere64322e2012-10-30 23:12:47 +01005625 while (1) {
5626 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005627 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005628
Victor Stinnere64322e2012-10-30 23:12:47 +01005629 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005630 enum PyUnicode_Kind kind = writer.kind;
5631 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005632 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005634 if (le) {
5635 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005636 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005637 if (ch > maxch)
5638 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005639 if (kind != PyUnicode_1BYTE_KIND &&
5640 Py_UNICODE_IS_SURROGATE(ch))
5641 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005642 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005643 q += 4;
5644 } while (q <= last);
5645 }
5646 else {
5647 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005648 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005649 if (ch > maxch)
5650 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005651 if (kind != PyUnicode_1BYTE_KIND &&
5652 Py_UNICODE_IS_SURROGATE(ch))
5653 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005654 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005655 q += 4;
5656 } while (q <= last);
5657 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005658 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005659 }
5660
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005661 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005662 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005663 startinpos = ((const char *)q) - starts;
5664 endinpos = startinpos + 4;
5665 }
5666 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005667 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005669 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005671 startinpos = ((const char *)q) - starts;
5672 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005673 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005674 else {
5675 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005676 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005677 goto onError;
5678 q += 4;
5679 continue;
5680 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005681 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005682 startinpos = ((const char *)q) - starts;
5683 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005685
5686 /* The remaining input chars are ignored if the callback
5687 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005688 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005690 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005692 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005694 }
5695
Walter Dörwald41980ca2007-08-16 21:55:45 +00005696 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005698
Walter Dörwald41980ca2007-08-16 21:55:45 +00005699 Py_XDECREF(errorHandler);
5700 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005701 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005702
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005704 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005705 Py_XDECREF(errorHandler);
5706 Py_XDECREF(exc);
5707 return NULL;
5708}
5709
5710PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005711_PyUnicode_EncodeUTF32(PyObject *str,
5712 const char *errors,
5713 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005714{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005715 enum PyUnicode_Kind kind;
5716 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005717 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005718 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005719 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005720#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005721 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005722#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005723 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005724#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005725 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005726 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005727 PyObject *errorHandler = NULL;
5728 PyObject *exc = NULL;
5729 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005730
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005731 if (!PyUnicode_Check(str)) {
5732 PyErr_BadArgument();
5733 return NULL;
5734 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005735 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005736 return NULL;
5737 kind = PyUnicode_KIND(str);
5738 data = PyUnicode_DATA(str);
5739 len = PyUnicode_GET_LENGTH(str);
5740
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005741 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005742 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005743 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005744 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005745 if (v == NULL)
5746 return NULL;
5747
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005748 /* output buffer is 4-bytes aligned */
5749 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005750 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005751 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005752 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005753 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005754 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005755
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005756 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005757 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005758 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005759 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005760 else
5761 encoding = "utf-32";
5762
5763 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005764 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5765 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005766 }
5767
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005768 pos = 0;
5769 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005770 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005771
5772 if (kind == PyUnicode_2BYTE_KIND) {
5773 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5774 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005775 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005776 else {
5777 assert(kind == PyUnicode_4BYTE_KIND);
5778 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5779 &out, native_ordering);
5780 }
5781 if (pos == len)
5782 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005783
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005784 rep = unicode_encode_call_errorhandler(
5785 errors, &errorHandler,
5786 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005787 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005788 if (!rep)
5789 goto error;
5790
5791 if (PyBytes_Check(rep)) {
5792 repsize = PyBytes_GET_SIZE(rep);
5793 if (repsize & 3) {
5794 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005795 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005796 "surrogates not allowed");
5797 goto error;
5798 }
5799 moreunits = repsize / 4;
5800 }
5801 else {
5802 assert(PyUnicode_Check(rep));
5803 if (PyUnicode_READY(rep) < 0)
5804 goto error;
5805 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5806 if (!PyUnicode_IS_ASCII(rep)) {
5807 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005808 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005809 "surrogates not allowed");
5810 goto error;
5811 }
5812 }
5813
5814 /* four bytes are reserved for each surrogate */
5815 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005816 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005817 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005818 /* integer overflow */
5819 PyErr_NoMemory();
5820 goto error;
5821 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005822 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005823 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005824 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005825 }
5826
5827 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005828 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005829 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005830 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005831 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005832 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5833 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005834 }
5835
5836 Py_CLEAR(rep);
5837 }
5838
5839 /* Cut back to size actually needed. This is necessary for, for example,
5840 encoding of a string containing isolated surrogates and the 'ignore'
5841 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005842 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005843 if (nsize != PyBytes_GET_SIZE(v))
5844 _PyBytes_Resize(&v, nsize);
5845 Py_XDECREF(errorHandler);
5846 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005847 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005848 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005849 error:
5850 Py_XDECREF(rep);
5851 Py_XDECREF(errorHandler);
5852 Py_XDECREF(exc);
5853 Py_XDECREF(v);
5854 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005855}
5856
Alexander Belopolsky40018472011-02-26 01:02:56 +00005857PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005858PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5859 Py_ssize_t size,
5860 const char *errors,
5861 int byteorder)
5862{
5863 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005864 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005865 if (tmp == NULL)
5866 return NULL;
5867 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5868 Py_DECREF(tmp);
5869 return result;
5870}
5871
5872PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005873PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005874{
Victor Stinnerb960b342011-11-20 19:12:52 +01005875 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005876}
5877
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878/* --- UTF-16 Codec ------------------------------------------------------- */
5879
Tim Peters772747b2001-08-09 22:21:55 +00005880PyObject *
5881PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005882 Py_ssize_t size,
5883 const char *errors,
5884 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885{
Walter Dörwald69652032004-09-07 20:24:22 +00005886 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5887}
5888
5889PyObject *
5890PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 Py_ssize_t size,
5892 const char *errors,
5893 int *byteorder,
5894 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005895{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005896 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005897 Py_ssize_t startinpos;
5898 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005899 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005900 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005901 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005902 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005903 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904 PyObject *errorHandler = NULL;
5905 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005906 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907
Andy Lestere6be9b52020-02-11 20:28:35 -06005908 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005909 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
5911 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005912 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005914 /* Check for BOM marks (U+FEFF) in the input and adjust current
5915 byte order setting accordingly. In native mode, the leading BOM
5916 mark is skipped, in all other modes, it is copied to the output
5917 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005918 if (bo == 0 && size >= 2) {
5919 const Py_UCS4 bom = (q[1] << 8) | q[0];
5920 if (bom == 0xFEFF) {
5921 q += 2;
5922 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005924 else if (bom == 0xFFFE) {
5925 q += 2;
5926 bo = 1;
5927 }
5928 if (byteorder)
5929 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931
Antoine Pitrou63065d72012-05-15 23:48:04 +02005932 if (q == e) {
5933 if (consumed)
5934 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005935 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005936 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005937
Christian Heimes743e0cd2012-10-17 23:52:17 +02005938#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005939 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005940 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005941#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005942 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005943 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005944#endif
Tim Peters772747b2001-08-09 22:21:55 +00005945
Antoine Pitrou63065d72012-05-15 23:48:04 +02005946 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005947 character count normally. Error handler will take care of
5948 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005949 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005950 writer.min_length = (e - q + 1) / 2;
5951 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005952 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005953
Antoine Pitrou63065d72012-05-15 23:48:04 +02005954 while (1) {
5955 Py_UCS4 ch = 0;
5956 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005957 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005958 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005959 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005960 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005961 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005962 native_ordering);
5963 else
5964 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005965 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005966 native_ordering);
5967 } else if (kind == PyUnicode_2BYTE_KIND) {
5968 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005969 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005970 native_ordering);
5971 } else {
5972 assert(kind == PyUnicode_4BYTE_KIND);
5973 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005974 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005975 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005976 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005977 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005978
Antoine Pitrou63065d72012-05-15 23:48:04 +02005979 switch (ch)
5980 {
5981 case 0:
5982 /* remaining byte at the end? (size should be even) */
5983 if (q == e || consumed)
5984 goto End;
5985 errmsg = "truncated data";
5986 startinpos = ((const char *)q) - starts;
5987 endinpos = ((const char *)e) - starts;
5988 break;
5989 /* The remaining input chars are ignored if the callback
5990 chooses to skip the input */
5991 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005992 q -= 2;
5993 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005994 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005995 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005996 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005997 endinpos = ((const char *)e) - starts;
5998 break;
5999 case 2:
6000 errmsg = "illegal encoding";
6001 startinpos = ((const char *)q) - 2 - starts;
6002 endinpos = startinpos + 2;
6003 break;
6004 case 3:
6005 errmsg = "illegal UTF-16 surrogate";
6006 startinpos = ((const char *)q) - 4 - starts;
6007 endinpos = startinpos + 2;
6008 break;
6009 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006010 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006011 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 continue;
6013 }
6014
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006015 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006016 errors,
6017 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006018 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006019 &starts,
6020 (const char **)&e,
6021 &startinpos,
6022 &endinpos,
6023 &exc,
6024 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006025 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 }
6028
Antoine Pitrou63065d72012-05-15 23:48:04 +02006029End:
Walter Dörwald69652032004-09-07 20:24:22 +00006030 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 Py_XDECREF(errorHandler);
6034 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006035 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006038 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006039 Py_XDECREF(errorHandler);
6040 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 return NULL;
6042}
6043
Tim Peters772747b2001-08-09 22:21:55 +00006044PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006045_PyUnicode_EncodeUTF16(PyObject *str,
6046 const char *errors,
6047 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006049 enum PyUnicode_Kind kind;
6050 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006051 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006052 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006053 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006054 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006055#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006056 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006057#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006058 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006059#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006060 const char *encoding;
6061 Py_ssize_t nsize, pos;
6062 PyObject *errorHandler = NULL;
6063 PyObject *exc = NULL;
6064 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006065
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006066 if (!PyUnicode_Check(str)) {
6067 PyErr_BadArgument();
6068 return NULL;
6069 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006070 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006071 return NULL;
6072 kind = PyUnicode_KIND(str);
6073 data = PyUnicode_DATA(str);
6074 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006075
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006076 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006077 if (kind == PyUnicode_4BYTE_KIND) {
6078 const Py_UCS4 *in = (const Py_UCS4 *)data;
6079 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006080 while (in < end) {
6081 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006082 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006083 }
6084 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006085 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006086 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006088 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006089 nsize = len + pairs + (byteorder == 0);
6090 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006091 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006093 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006095 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006096 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006097 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006098 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006099 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006100 }
6101 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006102 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006103 }
Tim Peters772747b2001-08-09 22:21:55 +00006104
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006105 if (kind == PyUnicode_1BYTE_KIND) {
6106 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6107 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006108 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006109
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006110 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006111 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006112 }
6113 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006114 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006115 }
6116 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006117 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006118 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006119
6120 pos = 0;
6121 while (pos < len) {
6122 Py_ssize_t repsize, moreunits;
6123
6124 if (kind == PyUnicode_2BYTE_KIND) {
6125 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6126 &out, native_ordering);
6127 }
6128 else {
6129 assert(kind == PyUnicode_4BYTE_KIND);
6130 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6131 &out, native_ordering);
6132 }
6133 if (pos == len)
6134 break;
6135
6136 rep = unicode_encode_call_errorhandler(
6137 errors, &errorHandler,
6138 encoding, "surrogates not allowed",
6139 str, &exc, pos, pos + 1, &pos);
6140 if (!rep)
6141 goto error;
6142
6143 if (PyBytes_Check(rep)) {
6144 repsize = PyBytes_GET_SIZE(rep);
6145 if (repsize & 1) {
6146 raise_encode_exception(&exc, encoding,
6147 str, pos - 1, pos,
6148 "surrogates not allowed");
6149 goto error;
6150 }
6151 moreunits = repsize / 2;
6152 }
6153 else {
6154 assert(PyUnicode_Check(rep));
6155 if (PyUnicode_READY(rep) < 0)
6156 goto error;
6157 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6158 if (!PyUnicode_IS_ASCII(rep)) {
6159 raise_encode_exception(&exc, encoding,
6160 str, pos - 1, pos,
6161 "surrogates not allowed");
6162 goto error;
6163 }
6164 }
6165
6166 /* two bytes are reserved for each surrogate */
6167 if (moreunits > 1) {
6168 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006169 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006170 /* integer overflow */
6171 PyErr_NoMemory();
6172 goto error;
6173 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006174 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006175 goto error;
6176 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6177 }
6178
6179 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006180 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006181 out += moreunits;
6182 } else /* rep is unicode */ {
6183 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6184 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6185 &out, native_ordering);
6186 }
6187
6188 Py_CLEAR(rep);
6189 }
6190
6191 /* Cut back to size actually needed. This is necessary for, for example,
6192 encoding of a string containing isolated surrogates and the 'ignore' handler
6193 is used. */
6194 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6195 if (nsize != PyBytes_GET_SIZE(v))
6196 _PyBytes_Resize(&v, nsize);
6197 Py_XDECREF(errorHandler);
6198 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006199 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006200 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006201 error:
6202 Py_XDECREF(rep);
6203 Py_XDECREF(errorHandler);
6204 Py_XDECREF(exc);
6205 Py_XDECREF(v);
6206 return NULL;
6207#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208}
6209
Alexander Belopolsky40018472011-02-26 01:02:56 +00006210PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006211PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6212 Py_ssize_t size,
6213 const char *errors,
6214 int byteorder)
6215{
6216 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006217 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006218 if (tmp == NULL)
6219 return NULL;
6220 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6221 Py_DECREF(tmp);
6222 return result;
6223}
6224
6225PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006226PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006228 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229}
6230
6231/* --- Unicode Escape Codec ----------------------------------------------- */
6232
/* Character-name lookup API used for \N{...} escapes.  Starts out NULL;
   _PyUnicode_DecodeUnicodeEscape() fills it in through PyCapsule_Import()
   the first time it meets a \N escape. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006234
Alexander Belopolsky40018472011-02-26 01:02:56 +00006235PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006236_PyUnicode_DecodeUnicodeEscape(const char *s,
6237 Py_ssize_t size,
6238 const char *errors,
6239 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006241 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006242 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006244 PyObject *errorHandler = NULL;
6245 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006246
Eric V. Smith42454af2016-10-31 09:22:08 -04006247 // so we can remember if we've seen an invalid escape char or not
6248 *first_invalid_escape = NULL;
6249
Victor Stinner62ec3312016-09-06 17:04:34 -07006250 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006251 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006252 }
6253 /* Escaped strings will always be longer than the resulting
6254 Unicode string, so we start with size here and then reduce the
6255 length after conversion to the true value.
6256 (but if the error callback returns a long replacement string
6257 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006258 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006259 writer.min_length = size;
6260 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6261 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006262 }
6263
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 end = s + size;
6265 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006266 unsigned char c = (unsigned char) *s++;
6267 Py_UCS4 ch;
6268 int count;
6269 Py_ssize_t startinpos;
6270 Py_ssize_t endinpos;
6271 const char *message;
6272
6273#define WRITE_ASCII_CHAR(ch) \
6274 do { \
6275 assert(ch <= 127); \
6276 assert(writer.pos < writer.size); \
6277 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6278 } while(0)
6279
6280#define WRITE_CHAR(ch) \
6281 do { \
6282 if (ch <= writer.maxchar) { \
6283 assert(writer.pos < writer.size); \
6284 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6285 } \
6286 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6287 goto onError; \
6288 } \
6289 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290
6291 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006292 if (c != '\\') {
6293 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 continue;
6295 }
6296
Victor Stinner62ec3312016-09-06 17:04:34 -07006297 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006299 if (s >= end) {
6300 message = "\\ at end of string";
6301 goto error;
6302 }
6303 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006304
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006306 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006309 case '\n': continue;
6310 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6311 case '\'': WRITE_ASCII_CHAR('\''); continue;
6312 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6313 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006314 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006315 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6316 case 't': WRITE_ASCII_CHAR('\t'); continue;
6317 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6318 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006319 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006320 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006321 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006322 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325 case '0': case '1': case '2': case '3':
6326 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006327 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006328 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006329 ch = (ch<<3) + *s++ - '0';
6330 if (s < end && '0' <= *s && *s <= '7') {
6331 ch = (ch<<3) + *s++ - '0';
6332 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006334 WRITE_CHAR(ch);
6335 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 /* hex escapes */
6338 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006341 message = "truncated \\xXX escape";
6342 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006346 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006347 message = "truncated \\uXXXX escape";
6348 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006351 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006352 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006353 message = "truncated \\UXXXXXXXX escape";
6354 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006355 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006356 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006357 ch <<= 4;
6358 if (c >= '0' && c <= '9') {
6359 ch += c - '0';
6360 }
6361 else if (c >= 'a' && c <= 'f') {
6362 ch += c - ('a' - 10);
6363 }
6364 else if (c >= 'A' && c <= 'F') {
6365 ch += c - ('A' - 10);
6366 }
6367 else {
6368 break;
6369 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006370 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006371 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006372 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006373 }
6374
6375 /* when we get here, ch is a 32-bit unicode character */
6376 if (ch > MAX_UNICODE) {
6377 message = "illegal Unicode character";
6378 goto error;
6379 }
6380
6381 WRITE_CHAR(ch);
6382 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006383
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006385 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006386 if (ucnhash_CAPI == NULL) {
6387 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006388 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6389 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006390 if (ucnhash_CAPI == NULL) {
6391 PyErr_SetString(
6392 PyExc_UnicodeError,
6393 "\\N escapes not supported (can't load unicodedata module)"
6394 );
6395 goto onError;
6396 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006397 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006398
6399 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006400 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006401 const char *start = ++s;
6402 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006403 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006404 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006405 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006406 namelen = s - start;
6407 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006408 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006409 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 ch = 0xffffffff; /* in case 'getcode' messes up */
6411 if (namelen <= INT_MAX &&
6412 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6413 &ch, 0)) {
6414 assert(ch <= MAX_UNICODE);
6415 WRITE_CHAR(ch);
6416 continue;
6417 }
6418 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006419 }
6420 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006421 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006422
6423 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006424 if (*first_invalid_escape == NULL) {
6425 *first_invalid_escape = s-1; /* Back up one char, since we've
6426 already incremented s. */
6427 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006428 WRITE_ASCII_CHAR('\\');
6429 WRITE_CHAR(c);
6430 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006432
6433 error:
6434 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006435 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006436 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006437 errors, &errorHandler,
6438 "unicodeescape", message,
6439 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006440 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006441 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006442 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006443 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006444
6445#undef WRITE_ASCII_CHAR
6446#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006448
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006449 Py_XDECREF(errorHandler);
6450 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006451 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006452
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006454 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 Py_XDECREF(errorHandler);
6456 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 return NULL;
6458}
6459
Eric V. Smith42454af2016-10-31 09:22:08 -04006460PyObject *
6461PyUnicode_DecodeUnicodeEscape(const char *s,
6462 Py_ssize_t size,
6463 const char *errors)
6464{
6465 const char *first_invalid_escape;
6466 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6467 &first_invalid_escape);
6468 if (result == NULL)
6469 return NULL;
6470 if (first_invalid_escape != NULL) {
6471 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6472 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006473 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006474 Py_DECREF(result);
6475 return NULL;
6476 }
6477 }
6478 return result;
6479}
6480
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006481/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482
Alexander Belopolsky40018472011-02-26 01:02:56 +00006483PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006484PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006486 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006487 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006489 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006490 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006491 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492
Ezio Melottie7f90372012-10-05 03:33:31 +03006493 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006494 escape.
6495
Ezio Melottie7f90372012-10-05 03:33:31 +03006496 For UCS1 strings it's '\xxx', 4 bytes per source character.
6497 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6498 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006499 */
6500
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006501 if (!PyUnicode_Check(unicode)) {
6502 PyErr_BadArgument();
6503 return NULL;
6504 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006505 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006506 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006507 }
Victor Stinner358af132015-10-12 22:36:57 +02006508
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006509 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006510 if (len == 0) {
6511 return PyBytes_FromStringAndSize(NULL, 0);
6512 }
6513
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006514 kind = PyUnicode_KIND(unicode);
6515 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006516 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6517 bytes, and 1 byte characters 4. */
6518 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006519 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006520 return PyErr_NoMemory();
6521 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006522 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006523 if (repr == NULL) {
6524 return NULL;
6525 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006526
Victor Stinner62ec3312016-09-06 17:04:34 -07006527 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006528 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006529 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006530
Victor Stinner62ec3312016-09-06 17:04:34 -07006531 /* U+0000-U+00ff range */
6532 if (ch < 0x100) {
6533 if (ch >= ' ' && ch < 127) {
6534 if (ch != '\\') {
6535 /* Copy printable US ASCII as-is */
6536 *p++ = (char) ch;
6537 }
6538 /* Escape backslashes */
6539 else {
6540 *p++ = '\\';
6541 *p++ = '\\';
6542 }
6543 }
Victor Stinner358af132015-10-12 22:36:57 +02006544
Victor Stinner62ec3312016-09-06 17:04:34 -07006545 /* Map special whitespace to '\t', \n', '\r' */
6546 else if (ch == '\t') {
6547 *p++ = '\\';
6548 *p++ = 't';
6549 }
6550 else if (ch == '\n') {
6551 *p++ = '\\';
6552 *p++ = 'n';
6553 }
6554 else if (ch == '\r') {
6555 *p++ = '\\';
6556 *p++ = 'r';
6557 }
6558
6559 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6560 else {
6561 *p++ = '\\';
6562 *p++ = 'x';
6563 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6564 *p++ = Py_hexdigits[ch & 0x000F];
6565 }
Tim Petersced69f82003-09-16 20:30:58 +00006566 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006567 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006568 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 *p++ = '\\';
6570 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006571 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6572 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6573 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6574 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006576 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6577 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006578
Victor Stinner62ec3312016-09-06 17:04:34 -07006579 /* Make sure that the first two digits are zero */
6580 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006581 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006582 *p++ = 'U';
6583 *p++ = '0';
6584 *p++ = '0';
6585 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6586 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6587 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6588 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6589 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6590 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593
Victor Stinner62ec3312016-09-06 17:04:34 -07006594 assert(p - PyBytes_AS_STRING(repr) > 0);
6595 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6596 return NULL;
6597 }
6598 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599}
6600
Alexander Belopolsky40018472011-02-26 01:02:56 +00006601PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006602PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6603 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006605 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006606 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006607 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006609 }
6610
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006611 result = PyUnicode_AsUnicodeEscapeString(tmp);
6612 Py_DECREF(tmp);
6613 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614}
6615
6616/* --- Raw Unicode Escape Codec ------------------------------------------- */
6617
Alexander Belopolsky40018472011-02-26 01:02:56 +00006618PyObject *
6619PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006620 Py_ssize_t size,
6621 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006623 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006624 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006626 PyObject *errorHandler = NULL;
6627 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006628
Victor Stinner62ec3312016-09-06 17:04:34 -07006629 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006630 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006631 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006632
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 /* Escaped strings will always be longer than the resulting
6634 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635 length after conversion to the true value. (But decoding error
6636 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006637 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006638 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006639 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6640 goto onError;
6641 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006642
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 end = s + size;
6644 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006645 unsigned char c = (unsigned char) *s++;
6646 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006647 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006648 Py_ssize_t startinpos;
6649 Py_ssize_t endinpos;
6650 const char *message;
6651
6652#define WRITE_CHAR(ch) \
6653 do { \
6654 if (ch <= writer.maxchar) { \
6655 assert(writer.pos < writer.size); \
6656 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6657 } \
6658 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6659 goto onError; \
6660 } \
6661 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662
Benjamin Peterson29060642009-01-31 22:14:21 +00006663 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006664 if (c != '\\' || s >= end) {
6665 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006667 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006668
Victor Stinner62ec3312016-09-06 17:04:34 -07006669 c = (unsigned char) *s++;
6670 if (c == 'u') {
6671 count = 4;
6672 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006674 else if (c == 'U') {
6675 count = 8;
6676 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006677 }
6678 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006679 assert(writer.pos < writer.size);
6680 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6681 WRITE_CHAR(c);
6682 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006683 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006684 startinpos = s - starts - 2;
6685
6686 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6687 for (ch = 0; count && s < end; ++s, --count) {
6688 c = (unsigned char)*s;
6689 ch <<= 4;
6690 if (c >= '0' && c <= '9') {
6691 ch += c - '0';
6692 }
6693 else if (c >= 'a' && c <= 'f') {
6694 ch += c - ('a' - 10);
6695 }
6696 else if (c >= 'A' && c <= 'F') {
6697 ch += c - ('A' - 10);
6698 }
6699 else {
6700 break;
6701 }
6702 }
6703 if (!count) {
6704 if (ch <= MAX_UNICODE) {
6705 WRITE_CHAR(ch);
6706 continue;
6707 }
6708 message = "\\Uxxxxxxxx out of range";
6709 }
6710
6711 endinpos = s-starts;
6712 writer.min_length = end - s + writer.pos;
6713 if (unicode_decode_call_errorhandler_writer(
6714 errors, &errorHandler,
6715 "rawunicodeescape", message,
6716 &starts, &end, &startinpos, &endinpos, &exc, &s,
6717 &writer)) {
6718 goto onError;
6719 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006720 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006721
6722#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724 Py_XDECREF(errorHandler);
6725 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006726 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006727
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006729 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730 Py_XDECREF(errorHandler);
6731 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734}
6735
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006736
Alexander Belopolsky40018472011-02-26 01:02:56 +00006737PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006738PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739{
Victor Stinner62ec3312016-09-06 17:04:34 -07006740 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006742 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006743 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006744 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006745 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006747 if (!PyUnicode_Check(unicode)) {
6748 PyErr_BadArgument();
6749 return NULL;
6750 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006751 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006752 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006753 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006754 kind = PyUnicode_KIND(unicode);
6755 data = PyUnicode_DATA(unicode);
6756 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006757 if (kind == PyUnicode_1BYTE_KIND) {
6758 return PyBytes_FromStringAndSize(data, len);
6759 }
Victor Stinner0e368262011-11-10 20:12:49 +01006760
Victor Stinner62ec3312016-09-06 17:04:34 -07006761 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6762 bytes, and 1 byte characters 4. */
6763 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006764
Victor Stinner62ec3312016-09-06 17:04:34 -07006765 if (len > PY_SSIZE_T_MAX / expandsize) {
6766 return PyErr_NoMemory();
6767 }
6768 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6769 if (repr == NULL) {
6770 return NULL;
6771 }
6772 if (len == 0) {
6773 return repr;
6774 }
6775
6776 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006777 for (pos = 0; pos < len; pos++) {
6778 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006779
Victor Stinner62ec3312016-09-06 17:04:34 -07006780 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6781 if (ch < 0x100) {
6782 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006783 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006784 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006785 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 *p++ = '\\';
6787 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006788 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6789 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6790 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6791 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006793 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6794 else {
6795 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6796 *p++ = '\\';
6797 *p++ = 'U';
6798 *p++ = '0';
6799 *p++ = '0';
6800 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6801 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6802 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6803 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6804 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6805 *p++ = Py_hexdigits[ch & 15];
6806 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006808
Victor Stinner62ec3312016-09-06 17:04:34 -07006809 assert(p > PyBytes_AS_STRING(repr));
6810 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6811 return NULL;
6812 }
6813 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814}
6815
Alexander Belopolsky40018472011-02-26 01:02:56 +00006816PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006817PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6818 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006820 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006821 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006822 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006823 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006824 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6825 Py_DECREF(tmp);
6826 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827}
6828
6829/* --- Latin-1 Codec ------------------------------------------------------ */
6830
Alexander Belopolsky40018472011-02-26 01:02:56 +00006831PyObject *
6832PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006833 Py_ssize_t size,
6834 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006837 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838}
6839
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006840/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006841static void
6842make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006843 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006844 PyObject *unicode,
6845 Py_ssize_t startpos, Py_ssize_t endpos,
6846 const char *reason)
6847{
6848 if (*exceptionObject == NULL) {
6849 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006850 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006851 encoding, unicode, startpos, endpos, reason);
6852 }
6853 else {
6854 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6855 goto onError;
6856 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6857 goto onError;
6858 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6859 goto onError;
6860 return;
6861 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006862 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006863 }
6864}
6865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006866/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006867static void
6868raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006869 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006870 PyObject *unicode,
6871 Py_ssize_t startpos, Py_ssize_t endpos,
6872 const char *reason)
6873{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006874 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006875 encoding, unicode, startpos, endpos, reason);
6876 if (*exceptionObject != NULL)
6877 PyCodec_StrictErrors(*exceptionObject);
6878}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006879
6880/* error handling callback helper:
6881 build arguments, call the callback and check the arguments,
6882 put the result into newpos and return the replacement string, which
6883 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006884static PyObject *
6885unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006886 PyObject **errorHandler,
6887 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006888 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006889 Py_ssize_t startpos, Py_ssize_t endpos,
6890 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006891{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006892 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006893 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006894 PyObject *restuple;
6895 PyObject *resunicode;
6896
6897 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006901 }
6902
Benjamin Petersonbac79492012-01-14 13:34:47 -05006903 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006904 return NULL;
6905 len = PyUnicode_GET_LENGTH(unicode);
6906
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006907 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006908 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006909 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006910 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006911
Petr Viktorinffd97532020-02-11 17:46:57 +01006912 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006915 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006916 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 Py_DECREF(restuple);
6918 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006920 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006921 &resunicode, newpos)) {
6922 Py_DECREF(restuple);
6923 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006924 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006925 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6926 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6927 Py_DECREF(restuple);
6928 return NULL;
6929 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006931 *newpos = len + *newpos;
6932 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006933 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 Py_DECREF(restuple);
6935 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006936 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937 Py_INCREF(resunicode);
6938 Py_DECREF(restuple);
6939 return resunicode;
6940}
6941
Alexander Belopolsky40018472011-02-26 01:02:56 +00006942static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006943unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006944 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006945 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006947 /* input state */
6948 Py_ssize_t pos=0, size;
6949 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006950 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006951 /* pointer into the output */
6952 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006953 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6954 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006955 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006956 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006957 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006958 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006959 /* output object */
6960 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006961
Benjamin Petersonbac79492012-01-14 13:34:47 -05006962 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006963 return NULL;
6964 size = PyUnicode_GET_LENGTH(unicode);
6965 kind = PyUnicode_KIND(unicode);
6966 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006967 /* allocate enough for a simple encoding without
6968 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006969 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006970 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006971
6972 _PyBytesWriter_Init(&writer);
6973 str = _PyBytesWriter_Alloc(&writer, size);
6974 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006975 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006976
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006977 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006978 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006979
Benjamin Peterson29060642009-01-31 22:14:21 +00006980 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006981 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006983 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006984 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006985 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006986 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006987 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006989 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006990 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006992
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006993 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006995
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006996 /* Only overallocate the buffer if it's not the last write */
6997 writer.overallocate = (collend < size);
6998
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007000 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007001 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007002
7003 switch (error_handler) {
7004 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007005 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007007
7008 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007009 memset(str, '?', collend - collstart);
7010 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007011 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007012 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007013 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 break;
Victor Stinner50149202015-09-22 00:26:54 +02007015
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007016 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007017 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007018 writer.min_size -= (collend - collstart);
7019 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007020 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007021 if (str == NULL)
7022 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007023 pos = collend;
7024 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007025
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007026 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007027 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007028 writer.min_size -= (collend - collstart);
7029 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007030 unicode, collstart, collend);
7031 if (str == NULL)
7032 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007033 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007034 break;
Victor Stinner50149202015-09-22 00:26:54 +02007035
Victor Stinnerc3713e92015-09-29 12:32:13 +02007036 case _Py_ERROR_SURROGATEESCAPE:
7037 for (i = collstart; i < collend; ++i) {
7038 ch = PyUnicode_READ(kind, data, i);
7039 if (ch < 0xdc80 || 0xdcff < ch) {
7040 /* Not a UTF-8b surrogate */
7041 break;
7042 }
7043 *str++ = (char)(ch - 0xdc00);
7044 ++pos;
7045 }
7046 if (i >= collend)
7047 break;
7048 collstart = pos;
7049 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007050 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007051
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007053 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7054 encoding, reason, unicode, &exc,
7055 collstart, collend, &newpos);
7056 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007058
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007059 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007060 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007061
Victor Stinner6bd525b2015-10-09 13:10:05 +02007062 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007063 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007064 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007065 PyBytes_AS_STRING(rep),
7066 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007067 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007068 else {
7069 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007070
Victor Stinner6bd525b2015-10-09 13:10:05 +02007071 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007073
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007074 if (limit == 256 ?
7075 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7076 !PyUnicode_IS_ASCII(rep))
7077 {
7078 /* Not all characters are smaller than limit */
7079 raise_encode_exception(&exc, encoding, unicode,
7080 collstart, collend, reason);
7081 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007082 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007083 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7084 str = _PyBytesWriter_WriteBytes(&writer, str,
7085 PyUnicode_DATA(rep),
7086 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007087 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007088 if (str == NULL)
7089 goto onError;
7090
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007091 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007092 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007093 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007094
7095 /* If overallocation was disabled, ensure that it was the last
7096 write. Otherwise, we missed an optimization */
7097 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007098 }
7099 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007100
Victor Stinner50149202015-09-22 00:26:54 +02007101 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007102 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007103 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007104
7105 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007106 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007107 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007108 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007109 Py_XDECREF(exc);
7110 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007111}
7112
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007113/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007114PyObject *
7115PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007116 Py_ssize_t size,
7117 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007119 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007120 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007121 if (unicode == NULL)
7122 return NULL;
7123 result = unicode_encode_ucs1(unicode, errors, 256);
7124 Py_DECREF(unicode);
7125 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126}
7127
Alexander Belopolsky40018472011-02-26 01:02:56 +00007128PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007129_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130{
7131 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 PyErr_BadArgument();
7133 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007135 if (PyUnicode_READY(unicode) == -1)
7136 return NULL;
7137 /* Fast path: if it is a one-byte string, construct
7138 bytes object directly. */
7139 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7140 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7141 PyUnicode_GET_LENGTH(unicode));
7142 /* Non-Latin-1 characters present. Defer to above function to
7143 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007144 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007145}
7146
7147PyObject*
7148PyUnicode_AsLatin1String(PyObject *unicode)
7149{
7150 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151}
7152
7153/* --- 7-bit ASCII Codec -------------------------------------------------- */
7154
Alexander Belopolsky40018472011-02-26 01:02:56 +00007155PyObject *
7156PyUnicode_DecodeASCII(const char *s,
7157 Py_ssize_t size,
7158 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007160 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007161 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007162 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007163 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007164 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007165
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007167 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007168
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007170 if (size == 1 && (unsigned char)s[0] < 128)
7171 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007172
Inada Naoki770847a2019-06-24 12:30:24 +09007173 // Shortcut for simple case
7174 PyObject *u = PyUnicode_New(size, 127);
7175 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007176 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007177 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007178 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007179 if (outpos == size) {
7180 return u;
7181 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007182
Inada Naoki770847a2019-06-24 12:30:24 +09007183 _PyUnicodeWriter writer;
7184 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007185 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007186
Inada Naoki770847a2019-06-24 12:30:24 +09007187 s += outpos;
7188 int kind = writer.kind;
7189 void *data = writer.data;
7190 Py_ssize_t startinpos, endinpos;
7191
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007192 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007193 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007194 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007195 PyUnicode_WRITE(kind, data, writer.pos, c);
7196 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007198 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007200
7201 /* byte outsize range 0x00..0x7f: call the error handler */
7202
7203 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007204 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007205
7206 switch (error_handler)
7207 {
7208 case _Py_ERROR_REPLACE:
7209 case _Py_ERROR_SURROGATEESCAPE:
7210 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007211 but we may switch to UCS2 at the first write */
7212 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7213 goto onError;
7214 kind = writer.kind;
7215 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007216
7217 if (error_handler == _Py_ERROR_REPLACE)
7218 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7219 else
7220 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7221 writer.pos++;
7222 ++s;
7223 break;
7224
7225 case _Py_ERROR_IGNORE:
7226 ++s;
7227 break;
7228
7229 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 startinpos = s-starts;
7231 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007232 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007233 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 "ascii", "ordinal not in range(128)",
7235 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007236 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007238 kind = writer.kind;
7239 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007242 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007243 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007244 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007245
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007247 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007248 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007249 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 return NULL;
7251}
7252
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007253/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007254PyObject *
7255PyUnicode_EncodeASCII(const Py_UNICODE *p,
7256 Py_ssize_t size,
7257 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007259 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007260 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007261 if (unicode == NULL)
7262 return NULL;
7263 result = unicode_encode_ucs1(unicode, errors, 128);
7264 Py_DECREF(unicode);
7265 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266}
7267
Alexander Belopolsky40018472011-02-26 01:02:56 +00007268PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007269_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270{
7271 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 PyErr_BadArgument();
7273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007275 if (PyUnicode_READY(unicode) == -1)
7276 return NULL;
7277 /* Fast path: if it is an ASCII-only string, construct bytes object
7278 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007279 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007280 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7281 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007282 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007283}
7284
7285PyObject *
7286PyUnicode_AsASCIIString(PyObject *unicode)
7287{
7288 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289}
7290
Steve Dowercc16be82016-09-08 10:35:16 -07007291#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007292
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007293/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007294
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007295#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007296#define NEED_RETRY
7297#endif
7298
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7303#define DECODING_CHUNK_SIZE (INT_MAX/4)
7304
Victor Stinner3a50e702011-10-18 21:21:00 +02007305#ifndef WC_ERR_INVALID_CHARS
7306# define WC_ERR_INVALID_CHARS 0x0080
7307#endif
7308
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007309static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007310code_page_name(UINT code_page, PyObject **obj)
7311{
7312 *obj = NULL;
7313 if (code_page == CP_ACP)
7314 return "mbcs";
7315 if (code_page == CP_UTF7)
7316 return "CP_UTF7";
7317 if (code_page == CP_UTF8)
7318 return "CP_UTF8";
7319
7320 *obj = PyBytes_FromFormat("cp%u", code_page);
7321 if (*obj == NULL)
7322 return NULL;
7323 return PyBytes_AS_STRING(*obj);
7324}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325
Victor Stinner3a50e702011-10-18 21:21:00 +02007326static DWORD
7327decode_code_page_flags(UINT code_page)
7328{
7329 if (code_page == CP_UTF7) {
7330 /* The CP_UTF7 decoder only supports flags=0 */
7331 return 0;
7332 }
7333 else
7334 return MB_ERR_INVALID_CHARS;
7335}
7336
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 * Decode a byte string from a Windows code page into unicode object in strict
7339 * mode.
7340 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007341 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7342 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007343 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007344static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007345decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007346 wchar_t **buf,
7347 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 const char *in,
7349 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007350{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007351 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007352 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007353 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007354
7355 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007356 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007357 while ((outsize = MultiByteToWideChar(code_page, flags,
7358 in, insize, NULL, 0)) <= 0)
7359 {
7360 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7361 goto error;
7362 }
7363 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7364 flags = 0;
7365 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007366
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007367 /* Extend a wchar_t* buffer */
7368 Py_ssize_t n = *bufsize; /* Get the current length */
7369 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7370 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007372 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007373
7374 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007375 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7376 if (outsize <= 0)
7377 goto error;
7378 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007379
Victor Stinner3a50e702011-10-18 21:21:00 +02007380error:
7381 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7382 return -2;
7383 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007384 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007385}
7386
Victor Stinner3a50e702011-10-18 21:21:00 +02007387/*
7388 * Decode a byte string from a code page into unicode object with an error
7389 * handler.
7390 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007391 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 * UnicodeDecodeError exception and returns -1 on error.
7393 */
7394static int
7395decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007396 wchar_t **buf,
7397 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007398 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007399 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007400{
7401 const char *startin = in;
7402 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007403 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007404 /* Ideally, we should get reason from FormatMessage. This is the Windows
7405 2000 English version of the message. */
7406 const char *reason = "No mapping for the Unicode character exists "
7407 "in the target code page.";
7408 /* each step cannot decode more than 1 character, but a character can be
7409 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007410 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007411 int insize;
7412 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 PyObject *errorHandler = NULL;
7414 PyObject *exc = NULL;
7415 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007416 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 DWORD err;
7418 int ret = -1;
7419
7420 assert(size > 0);
7421
7422 encoding = code_page_name(code_page, &encoding_obj);
7423 if (encoding == NULL)
7424 return -1;
7425
Victor Stinner7d00cc12014-03-17 23:08:06 +01007426 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7428 UnicodeDecodeError. */
7429 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7430 if (exc != NULL) {
7431 PyCodec_StrictErrors(exc);
7432 Py_CLEAR(exc);
7433 }
7434 goto error;
7435 }
7436
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007437 /* Extend a wchar_t* buffer */
7438 Py_ssize_t n = *bufsize; /* Get the current length */
7439 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7440 PyErr_NoMemory();
7441 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007443 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7444 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007446 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007447
7448 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 while (in < endin)
7450 {
7451 /* Decode a character */
7452 insize = 1;
7453 do
7454 {
7455 outsize = MultiByteToWideChar(code_page, flags,
7456 in, insize,
7457 buffer, Py_ARRAY_LENGTH(buffer));
7458 if (outsize > 0)
7459 break;
7460 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007461 if (err == ERROR_INVALID_FLAGS && flags) {
7462 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7463 flags = 0;
7464 continue;
7465 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 if (err != ERROR_NO_UNICODE_TRANSLATION
7467 && err != ERROR_INSUFFICIENT_BUFFER)
7468 {
7469 PyErr_SetFromWindowsErr(0);
7470 goto error;
7471 }
7472 insize++;
7473 }
7474 /* 4=maximum length of a UTF-8 sequence */
7475 while (insize <= 4 && (in + insize) <= endin);
7476
7477 if (outsize <= 0) {
7478 Py_ssize_t startinpos, endinpos, outpos;
7479
Victor Stinner7d00cc12014-03-17 23:08:06 +01007480 /* last character in partial decode? */
7481 if (in + insize >= endin && !final)
7482 break;
7483
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 startinpos = in - startin;
7485 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007486 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007487 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 errors, &errorHandler,
7489 encoding, reason,
7490 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007491 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 {
7493 goto error;
7494 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007495 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 }
7497 else {
7498 in += insize;
7499 memcpy(out, buffer, outsize * sizeof(wchar_t));
7500 out += outsize;
7501 }
7502 }
7503
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007504 /* Shrink the buffer */
7505 assert(out - *buf <= *bufsize);
7506 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007507 /* (in - startin) <= size and size is an int */
7508 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007509
7510error:
7511 Py_XDECREF(encoding_obj);
7512 Py_XDECREF(errorHandler);
7513 Py_XDECREF(exc);
7514 return ret;
7515}
7516
Victor Stinner3a50e702011-10-18 21:21:00 +02007517static PyObject *
7518decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007519 const char *s, Py_ssize_t size,
7520 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007521{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007522 wchar_t *buf = NULL;
7523 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007524 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007525
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 if (code_page < 0) {
7527 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7528 return NULL;
7529 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007530 if (size < 0) {
7531 PyErr_BadInternalCall();
7532 return NULL;
7533 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007534
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007535 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007537
Victor Stinner76a31a62011-11-04 00:05:13 +01007538 do
7539 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007540#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007541 if (size > DECODING_CHUNK_SIZE) {
7542 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007543 final = 0;
7544 done = 0;
7545 }
7546 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007547#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007548 {
7549 chunk_size = (int)size;
7550 final = (consumed == NULL);
7551 done = 1;
7552 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007553
Victor Stinner76a31a62011-11-04 00:05:13 +01007554 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007555 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007556 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007557 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007558 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007559
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007560 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007561 s, chunk_size);
7562 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007563 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007564 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007565 errors, final);
7566 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007567
7568 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007569 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007570 return NULL;
7571 }
7572
7573 if (consumed)
7574 *consumed += converted;
7575
7576 s += converted;
7577 size -= converted;
7578 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007579
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007580 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7581 PyMem_Free(buf);
7582 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007583}
7584
Alexander Belopolsky40018472011-02-26 01:02:56 +00007585PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007586PyUnicode_DecodeCodePageStateful(int code_page,
7587 const char *s,
7588 Py_ssize_t size,
7589 const char *errors,
7590 Py_ssize_t *consumed)
7591{
7592 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7593}
7594
7595PyObject *
7596PyUnicode_DecodeMBCSStateful(const char *s,
7597 Py_ssize_t size,
7598 const char *errors,
7599 Py_ssize_t *consumed)
7600{
7601 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7602}
7603
7604PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007605PyUnicode_DecodeMBCS(const char *s,
7606 Py_ssize_t size,
7607 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007608{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007609 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7610}
7611
Victor Stinner3a50e702011-10-18 21:21:00 +02007612static DWORD
7613encode_code_page_flags(UINT code_page, const char *errors)
7614{
7615 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007616 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 }
7618 else if (code_page == CP_UTF7) {
7619 /* CP_UTF7 only supports flags=0 */
7620 return 0;
7621 }
7622 else {
7623 if (errors != NULL && strcmp(errors, "replace") == 0)
7624 return 0;
7625 else
7626 return WC_NO_BEST_FIT_CHARS;
7627 }
7628}
7629
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007630/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007631 * Encode a Unicode string to a Windows code page into a byte string in strict
7632 * mode.
7633 *
7634 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007635 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007636 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007637static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007638encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007639 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007641{
Victor Stinner554f3f02010-06-16 23:33:54 +00007642 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007643 BOOL *pusedDefaultChar = &usedDefaultChar;
7644 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007645 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007646 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 const DWORD flags = encode_code_page_flags(code_page, NULL);
7648 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007649 /* Create a substring so that we can get the UTF-16 representation
7650 of just the slice under consideration. */
7651 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007652
Martin v. Löwis3d325192011-11-04 18:23:06 +01007653 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007654
Victor Stinner3a50e702011-10-18 21:21:00 +02007655 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007656 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007657 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007658 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007659
Victor Stinner2fc507f2011-11-04 20:06:39 +01007660 substring = PyUnicode_Substring(unicode, offset, offset+len);
7661 if (substring == NULL)
7662 return -1;
7663 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7664 if (p == NULL) {
7665 Py_DECREF(substring);
7666 return -1;
7667 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007668 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007669
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007670 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007671 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007672 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007673 NULL, 0,
7674 NULL, pusedDefaultChar);
7675 if (outsize <= 0)
7676 goto error;
7677 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007678 if (pusedDefaultChar && *pusedDefaultChar) {
7679 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007680 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007681 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007682
Victor Stinner3a50e702011-10-18 21:21:00 +02007683 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007685 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007686 if (*outbytes == NULL) {
7687 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007689 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007690 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007691 }
7692 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007694 const Py_ssize_t n = PyBytes_Size(*outbytes);
7695 if (outsize > PY_SSIZE_T_MAX - n) {
7696 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007697 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007698 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007699 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007700 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7701 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007702 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007703 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007704 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007705 }
7706
7707 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007708 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007709 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007710 out, outsize,
7711 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007712 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007713 if (outsize <= 0)
7714 goto error;
7715 if (pusedDefaultChar && *pusedDefaultChar)
7716 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007717 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007718
Victor Stinner3a50e702011-10-18 21:21:00 +02007719error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007720 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007721 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7722 return -2;
7723 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007724 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007725}
7726
Victor Stinner3a50e702011-10-18 21:21:00 +02007727/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007728 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007729 * error handler.
7730 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007731 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007732 * -1 on other error.
7733 */
7734static int
7735encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007736 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007737 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007738{
Victor Stinner3a50e702011-10-18 21:21:00 +02007739 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007740 Py_ssize_t pos = unicode_offset;
7741 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007742 /* Ideally, we should get reason from FormatMessage. This is the Windows
7743 2000 English version of the message. */
7744 const char *reason = "invalid character";
7745 /* 4=maximum length of a UTF-8 sequence */
7746 char buffer[4];
7747 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7748 Py_ssize_t outsize;
7749 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007750 PyObject *errorHandler = NULL;
7751 PyObject *exc = NULL;
7752 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007753 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007754 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007755 PyObject *rep;
7756 int ret = -1;
7757
7758 assert(insize > 0);
7759
7760 encoding = code_page_name(code_page, &encoding_obj);
7761 if (encoding == NULL)
7762 return -1;
7763
7764 if (errors == NULL || strcmp(errors, "strict") == 0) {
7765 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7766 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007767 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007768 if (exc != NULL) {
7769 PyCodec_StrictErrors(exc);
7770 Py_DECREF(exc);
7771 }
7772 Py_XDECREF(encoding_obj);
7773 return -1;
7774 }
7775
7776 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7777 pusedDefaultChar = &usedDefaultChar;
7778 else
7779 pusedDefaultChar = NULL;
7780
7781 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7782 PyErr_NoMemory();
7783 goto error;
7784 }
7785 outsize = insize * Py_ARRAY_LENGTH(buffer);
7786
7787 if (*outbytes == NULL) {
7788 /* Create string object */
7789 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7790 if (*outbytes == NULL)
7791 goto error;
7792 out = PyBytes_AS_STRING(*outbytes);
7793 }
7794 else {
7795 /* Extend string object */
7796 Py_ssize_t n = PyBytes_Size(*outbytes);
7797 if (n > PY_SSIZE_T_MAX - outsize) {
7798 PyErr_NoMemory();
7799 goto error;
7800 }
7801 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7802 goto error;
7803 out = PyBytes_AS_STRING(*outbytes) + n;
7804 }
7805
7806 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007807 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007808 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007809 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7810 wchar_t chars[2];
7811 int charsize;
7812 if (ch < 0x10000) {
7813 chars[0] = (wchar_t)ch;
7814 charsize = 1;
7815 }
7816 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007817 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7818 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007819 charsize = 2;
7820 }
7821
Victor Stinner3a50e702011-10-18 21:21:00 +02007822 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007823 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007824 buffer, Py_ARRAY_LENGTH(buffer),
7825 NULL, pusedDefaultChar);
7826 if (outsize > 0) {
7827 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7828 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007829 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007830 memcpy(out, buffer, outsize);
7831 out += outsize;
7832 continue;
7833 }
7834 }
7835 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7836 PyErr_SetFromWindowsErr(0);
7837 goto error;
7838 }
7839
Victor Stinner3a50e702011-10-18 21:21:00 +02007840 rep = unicode_encode_call_errorhandler(
7841 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007842 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007843 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007844 if (rep == NULL)
7845 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007846 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007847
7848 if (PyBytes_Check(rep)) {
7849 outsize = PyBytes_GET_SIZE(rep);
7850 if (outsize != 1) {
7851 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7852 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7853 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7854 Py_DECREF(rep);
7855 goto error;
7856 }
7857 out = PyBytes_AS_STRING(*outbytes) + offset;
7858 }
7859 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7860 out += outsize;
7861 }
7862 else {
7863 Py_ssize_t i;
7864 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007865 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007866
Benjamin Petersonbac79492012-01-14 13:34:47 -05007867 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007868 Py_DECREF(rep);
7869 goto error;
7870 }
7871
7872 outsize = PyUnicode_GET_LENGTH(rep);
7873 if (outsize != 1) {
7874 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7875 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7876 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7877 Py_DECREF(rep);
7878 goto error;
7879 }
7880 out = PyBytes_AS_STRING(*outbytes) + offset;
7881 }
7882 kind = PyUnicode_KIND(rep);
7883 data = PyUnicode_DATA(rep);
7884 for (i=0; i < outsize; i++) {
7885 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7886 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007887 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007888 encoding, unicode,
7889 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007890 "unable to encode error handler result to ASCII");
7891 Py_DECREF(rep);
7892 goto error;
7893 }
7894 *out = (unsigned char)ch;
7895 out++;
7896 }
7897 }
7898 Py_DECREF(rep);
7899 }
7900 /* write a NUL byte */
7901 *out = 0;
7902 outsize = out - PyBytes_AS_STRING(*outbytes);
7903 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7904 if (_PyBytes_Resize(outbytes, outsize) < 0)
7905 goto error;
7906 ret = 0;
7907
7908error:
7909 Py_XDECREF(encoding_obj);
7910 Py_XDECREF(errorHandler);
7911 Py_XDECREF(exc);
7912 return ret;
7913}
7914
Victor Stinner3a50e702011-10-18 21:21:00 +02007915static PyObject *
7916encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007917 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007918 const char *errors)
7919{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007920 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007921 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007922 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007923 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007924
Victor Stinner29dacf22015-01-26 16:41:32 +01007925 if (!PyUnicode_Check(unicode)) {
7926 PyErr_BadArgument();
7927 return NULL;
7928 }
7929
Benjamin Petersonbac79492012-01-14 13:34:47 -05007930 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007931 return NULL;
7932 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007933
Victor Stinner3a50e702011-10-18 21:21:00 +02007934 if (code_page < 0) {
7935 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7936 return NULL;
7937 }
7938
Martin v. Löwis3d325192011-11-04 18:23:06 +01007939 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007940 return PyBytes_FromStringAndSize(NULL, 0);
7941
Victor Stinner7581cef2011-11-03 22:32:33 +01007942 offset = 0;
7943 do
7944 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007945#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007946 if (len > DECODING_CHUNK_SIZE) {
7947 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007948 done = 0;
7949 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007950 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007951#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007952 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007953 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007954 done = 1;
7955 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007956
Victor Stinner76a31a62011-11-04 00:05:13 +01007957 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007958 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007959 errors);
7960 if (ret == -2)
7961 ret = encode_code_page_errors(code_page, &outbytes,
7962 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007963 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007964 if (ret < 0) {
7965 Py_XDECREF(outbytes);
7966 return NULL;
7967 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007968
Victor Stinner7581cef2011-11-03 22:32:33 +01007969 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007970 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007971 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007972
Victor Stinner3a50e702011-10-18 21:21:00 +02007973 return outbytes;
7974}
7975
7976PyObject *
7977PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7978 Py_ssize_t size,
7979 const char *errors)
7980{
Victor Stinner7581cef2011-11-03 22:32:33 +01007981 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007982 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007983 if (unicode == NULL)
7984 return NULL;
7985 res = encode_code_page(CP_ACP, unicode, errors);
7986 Py_DECREF(unicode);
7987 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007988}
7989
7990PyObject *
7991PyUnicode_EncodeCodePage(int code_page,
7992 PyObject *unicode,
7993 const char *errors)
7994{
Victor Stinner7581cef2011-11-03 22:32:33 +01007995 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007996}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007997
Alexander Belopolsky40018472011-02-26 01:02:56 +00007998PyObject *
7999PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008000{
Victor Stinner7581cef2011-11-03 22:32:33 +01008001 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008002}
8003
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008004#undef NEED_RETRY
8005
Steve Dowercc16be82016-09-08 10:35:16 -07008006#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008007
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008/* --- Character Mapping Codec -------------------------------------------- */
8009
Victor Stinnerfb161b12013-04-18 01:44:27 +02008010static int
8011charmap_decode_string(const char *s,
8012 Py_ssize_t size,
8013 PyObject *mapping,
8014 const char *errors,
8015 _PyUnicodeWriter *writer)
8016{
8017 const char *starts = s;
8018 const char *e;
8019 Py_ssize_t startinpos, endinpos;
8020 PyObject *errorHandler = NULL, *exc = NULL;
8021 Py_ssize_t maplen;
8022 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008023 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008024 Py_UCS4 x;
8025 unsigned char ch;
8026
8027 if (PyUnicode_READY(mapping) == -1)
8028 return -1;
8029
8030 maplen = PyUnicode_GET_LENGTH(mapping);
8031 mapdata = PyUnicode_DATA(mapping);
8032 mapkind = PyUnicode_KIND(mapping);
8033
8034 e = s + size;
8035
8036 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8037 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8038 * is disabled in encoding aliases, latin1 is preferred because
8039 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008040 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008041 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8042 Py_UCS4 maxchar = writer->maxchar;
8043
8044 assert (writer->kind == PyUnicode_1BYTE_KIND);
8045 while (s < e) {
8046 ch = *s;
8047 x = mapdata_ucs1[ch];
8048 if (x > maxchar) {
8049 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8050 goto onError;
8051 maxchar = writer->maxchar;
8052 outdata = (Py_UCS1 *)writer->data;
8053 }
8054 outdata[writer->pos] = x;
8055 writer->pos++;
8056 ++s;
8057 }
8058 return 0;
8059 }
8060
8061 while (s < e) {
8062 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8063 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008064 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008065 if (outkind == PyUnicode_1BYTE_KIND) {
8066 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8067 Py_UCS4 maxchar = writer->maxchar;
8068 while (s < e) {
8069 ch = *s;
8070 x = mapdata_ucs2[ch];
8071 if (x > maxchar)
8072 goto Error;
8073 outdata[writer->pos] = x;
8074 writer->pos++;
8075 ++s;
8076 }
8077 break;
8078 }
8079 else if (outkind == PyUnicode_2BYTE_KIND) {
8080 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8081 while (s < e) {
8082 ch = *s;
8083 x = mapdata_ucs2[ch];
8084 if (x == 0xFFFE)
8085 goto Error;
8086 outdata[writer->pos] = x;
8087 writer->pos++;
8088 ++s;
8089 }
8090 break;
8091 }
8092 }
8093 ch = *s;
8094
8095 if (ch < maplen)
8096 x = PyUnicode_READ(mapkind, mapdata, ch);
8097 else
8098 x = 0xfffe; /* invalid value */
8099Error:
8100 if (x == 0xfffe)
8101 {
8102 /* undefined mapping */
8103 startinpos = s-starts;
8104 endinpos = startinpos+1;
8105 if (unicode_decode_call_errorhandler_writer(
8106 errors, &errorHandler,
8107 "charmap", "character maps to <undefined>",
8108 &starts, &e, &startinpos, &endinpos, &exc, &s,
8109 writer)) {
8110 goto onError;
8111 }
8112 continue;
8113 }
8114
8115 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8116 goto onError;
8117 ++s;
8118 }
8119 Py_XDECREF(errorHandler);
8120 Py_XDECREF(exc);
8121 return 0;
8122
8123onError:
8124 Py_XDECREF(errorHandler);
8125 Py_XDECREF(exc);
8126 return -1;
8127}
8128
8129static int
8130charmap_decode_mapping(const char *s,
8131 Py_ssize_t size,
8132 PyObject *mapping,
8133 const char *errors,
8134 _PyUnicodeWriter *writer)
8135{
8136 const char *starts = s;
8137 const char *e;
8138 Py_ssize_t startinpos, endinpos;
8139 PyObject *errorHandler = NULL, *exc = NULL;
8140 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008141 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008142
8143 e = s + size;
8144
8145 while (s < e) {
8146 ch = *s;
8147
8148 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8149 key = PyLong_FromLong((long)ch);
8150 if (key == NULL)
8151 goto onError;
8152
8153 item = PyObject_GetItem(mapping, key);
8154 Py_DECREF(key);
8155 if (item == NULL) {
8156 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8157 /* No mapping found means: mapping is undefined. */
8158 PyErr_Clear();
8159 goto Undefined;
8160 } else
8161 goto onError;
8162 }
8163
8164 /* Apply mapping */
8165 if (item == Py_None)
8166 goto Undefined;
8167 if (PyLong_Check(item)) {
8168 long value = PyLong_AS_LONG(item);
8169 if (value == 0xFFFE)
8170 goto Undefined;
8171 if (value < 0 || value > MAX_UNICODE) {
8172 PyErr_Format(PyExc_TypeError,
8173 "character mapping must be in range(0x%lx)",
8174 (unsigned long)MAX_UNICODE + 1);
8175 goto onError;
8176 }
8177
8178 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8179 goto onError;
8180 }
8181 else if (PyUnicode_Check(item)) {
8182 if (PyUnicode_READY(item) == -1)
8183 goto onError;
8184 if (PyUnicode_GET_LENGTH(item) == 1) {
8185 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8186 if (value == 0xFFFE)
8187 goto Undefined;
8188 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8189 goto onError;
8190 }
8191 else {
8192 writer->overallocate = 1;
8193 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8194 goto onError;
8195 }
8196 }
8197 else {
8198 /* wrong return value */
8199 PyErr_SetString(PyExc_TypeError,
8200 "character mapping must return integer, None or str");
8201 goto onError;
8202 }
8203 Py_CLEAR(item);
8204 ++s;
8205 continue;
8206
8207Undefined:
8208 /* undefined mapping */
8209 Py_CLEAR(item);
8210 startinpos = s-starts;
8211 endinpos = startinpos+1;
8212 if (unicode_decode_call_errorhandler_writer(
8213 errors, &errorHandler,
8214 "charmap", "character maps to <undefined>",
8215 &starts, &e, &startinpos, &endinpos, &exc, &s,
8216 writer)) {
8217 goto onError;
8218 }
8219 }
8220 Py_XDECREF(errorHandler);
8221 Py_XDECREF(exc);
8222 return 0;
8223
8224onError:
8225 Py_XDECREF(item);
8226 Py_XDECREF(errorHandler);
8227 Py_XDECREF(exc);
8228 return -1;
8229}
8230
Alexander Belopolsky40018472011-02-26 01:02:56 +00008231PyObject *
8232PyUnicode_DecodeCharmap(const char *s,
8233 Py_ssize_t size,
8234 PyObject *mapping,
8235 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008237 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008238
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 /* Default to Latin-1 */
8240 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008244 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008245 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008246 writer.min_length = size;
8247 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008249
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008250 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008251 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8252 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008253 }
8254 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008255 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8256 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008258 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008259
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008261 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 return NULL;
8263}
8264
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008265/* Charmap encoding: the lookup table */
8266
Alexander Belopolsky40018472011-02-26 01:02:56 +00008267struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 PyObject_HEAD
8269 unsigned char level1[32];
8270 int count2, count3;
8271 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008272};
8273
8274static PyObject*
8275encoding_map_size(PyObject *obj, PyObject* args)
8276{
8277 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008278 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008280}
8281
8282static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008283 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 PyDoc_STR("Return the size (in bytes) of this object") },
8285 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008286};
8287
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008288static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008289 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 "EncodingMap", /*tp_name*/
8291 sizeof(struct encoding_map), /*tp_basicsize*/
8292 0, /*tp_itemsize*/
8293 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008294 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008295 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 0, /*tp_getattr*/
8297 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008298 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 0, /*tp_repr*/
8300 0, /*tp_as_number*/
8301 0, /*tp_as_sequence*/
8302 0, /*tp_as_mapping*/
8303 0, /*tp_hash*/
8304 0, /*tp_call*/
8305 0, /*tp_str*/
8306 0, /*tp_getattro*/
8307 0, /*tp_setattro*/
8308 0, /*tp_as_buffer*/
8309 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8310 0, /*tp_doc*/
8311 0, /*tp_traverse*/
8312 0, /*tp_clear*/
8313 0, /*tp_richcompare*/
8314 0, /*tp_weaklistoffset*/
8315 0, /*tp_iter*/
8316 0, /*tp_iternext*/
8317 encoding_map_methods, /*tp_methods*/
8318 0, /*tp_members*/
8319 0, /*tp_getset*/
8320 0, /*tp_base*/
8321 0, /*tp_dict*/
8322 0, /*tp_descr_get*/
8323 0, /*tp_descr_set*/
8324 0, /*tp_dictoffset*/
8325 0, /*tp_init*/
8326 0, /*tp_alloc*/
8327 0, /*tp_new*/
8328 0, /*tp_free*/
8329 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330};
8331
8332PyObject*
8333PyUnicode_BuildEncodingMap(PyObject* string)
8334{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335 PyObject *result;
8336 struct encoding_map *mresult;
8337 int i;
8338 int need_dict = 0;
8339 unsigned char level1[32];
8340 unsigned char level2[512];
8341 unsigned char *mlevel1, *mlevel2, *mlevel3;
8342 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008343 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008344 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008345 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008346 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008347
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008348 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008349 PyErr_BadArgument();
8350 return NULL;
8351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 kind = PyUnicode_KIND(string);
8353 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008354 length = PyUnicode_GET_LENGTH(string);
8355 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008356 memset(level1, 0xFF, sizeof level1);
8357 memset(level2, 0xFF, sizeof level2);
8358
8359 /* If there isn't a one-to-one mapping of NULL to \0,
8360 or if there are non-BMP characters, we need to use
8361 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008363 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008364 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008365 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 ch = PyUnicode_READ(kind, data, i);
8367 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008368 need_dict = 1;
8369 break;
8370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008371 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008372 /* unmapped character */
8373 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 l1 = ch >> 11;
8375 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008376 if (level1[l1] == 0xFF)
8377 level1[l1] = count2++;
8378 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008379 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008380 }
8381
8382 if (count2 >= 0xFF || count3 >= 0xFF)
8383 need_dict = 1;
8384
8385 if (need_dict) {
8386 PyObject *result = PyDict_New();
8387 PyObject *key, *value;
8388 if (!result)
8389 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008390 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008392 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008393 if (!key || !value)
8394 goto failed1;
8395 if (PyDict_SetItem(result, key, value) == -1)
8396 goto failed1;
8397 Py_DECREF(key);
8398 Py_DECREF(value);
8399 }
8400 return result;
8401 failed1:
8402 Py_XDECREF(key);
8403 Py_XDECREF(value);
8404 Py_DECREF(result);
8405 return NULL;
8406 }
8407
8408 /* Create a three-level trie */
8409 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8410 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008411 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008412 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008413 }
8414
8415 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008416 mresult = (struct encoding_map*)result;
8417 mresult->count2 = count2;
8418 mresult->count3 = count3;
8419 mlevel1 = mresult->level1;
8420 mlevel2 = mresult->level23;
8421 mlevel3 = mresult->level23 + 16*count2;
8422 memcpy(mlevel1, level1, 32);
8423 memset(mlevel2, 0xFF, 16*count2);
8424 memset(mlevel3, 0, 128*count3);
8425 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008426 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008427 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008428 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8429 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008430 /* unmapped character */
8431 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008432 o1 = ch>>11;
8433 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008434 i2 = 16*mlevel1[o1] + o2;
8435 if (mlevel2[i2] == 0xFF)
8436 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008437 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008438 i3 = 128*mlevel2[i2] + o3;
8439 mlevel3[i3] = i;
8440 }
8441 return result;
8442}
8443
8444static int
Victor Stinner22168992011-11-20 17:09:18 +01008445encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008446{
8447 struct encoding_map *map = (struct encoding_map*)mapping;
8448 int l1 = c>>11;
8449 int l2 = (c>>7) & 0xF;
8450 int l3 = c & 0x7F;
8451 int i;
8452
Victor Stinner22168992011-11-20 17:09:18 +01008453 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008455 if (c == 0)
8456 return 0;
8457 /* level 1*/
8458 i = map->level1[l1];
8459 if (i == 0xFF) {
8460 return -1;
8461 }
8462 /* level 2*/
8463 i = map->level23[16*i+l2];
8464 if (i == 0xFF) {
8465 return -1;
8466 }
8467 /* level 3 */
8468 i = map->level23[16*map->count2 + 128*i + l3];
8469 if (i == 0) {
8470 return -1;
8471 }
8472 return i;
8473}
8474
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475/* Lookup the character ch in the mapping. If the character
8476 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008477 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008478static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008479charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480{
Christian Heimes217cfd12007-12-02 14:31:20 +00008481 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482 PyObject *x;
8483
8484 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486 x = PyObject_GetItem(mapping, w);
8487 Py_DECREF(w);
8488 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8490 /* No mapping found means: mapping is undefined. */
8491 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008492 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 } else
8494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008496 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008498 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 long value = PyLong_AS_LONG(x);
8500 if (value < 0 || value > 255) {
8501 PyErr_SetString(PyExc_TypeError,
8502 "character mapping must be in range(256)");
8503 Py_DECREF(x);
8504 return NULL;
8505 }
8506 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008508 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 /* wrong return value */
8512 PyErr_Format(PyExc_TypeError,
8513 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008514 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 Py_DECREF(x);
8516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 }
8518}
8519
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008520static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008521charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008522{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008523 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8524 /* exponentially overallocate to minimize reallocations */
8525 if (requiredsize < 2*outsize)
8526 requiredsize = 2*outsize;
8527 if (_PyBytes_Resize(outobj, requiredsize))
8528 return -1;
8529 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008530}
8531
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred and an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008535/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008536 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 space is available. Return a new reference to the object that
8538 was put in the output buffer, or Py_None, if the mapping was undefined
8539 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008540 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008541static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008542charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008543 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008545 PyObject *rep;
8546 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008547 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548
Andy Lesterdffe4c02020-03-04 07:15:20 -06008549 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008550 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008552 if (res == -1)
8553 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 if (outsize<requiredsize)
8555 if (charmapencode_resize(outobj, outpos, requiredsize))
8556 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008557 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 outstart[(*outpos)++] = (char)res;
8559 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008560 }
8561
8562 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008565 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 Py_DECREF(rep);
8567 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008568 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 if (PyLong_Check(rep)) {
8570 Py_ssize_t requiredsize = *outpos+1;
8571 if (outsize<requiredsize)
8572 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8573 Py_DECREF(rep);
8574 return enc_EXCEPTION;
8575 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008576 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008578 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 else {
8580 const char *repchars = PyBytes_AS_STRING(rep);
8581 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8582 Py_ssize_t requiredsize = *outpos+repsize;
8583 if (outsize<requiredsize)
8584 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8585 Py_DECREF(rep);
8586 return enc_EXCEPTION;
8587 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008588 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 memcpy(outstart + *outpos, repchars, repsize);
8590 *outpos += repsize;
8591 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008593 Py_DECREF(rep);
8594 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595}
8596
8597/* handle an error in PyUnicode_EncodeCharmap
8598 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008599static int
8600charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008601 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008602 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008603 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008604 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008605{
8606 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008607 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008608 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008609 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008610 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008611 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008612 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008613 Py_ssize_t collstartpos = *inpos;
8614 Py_ssize_t collendpos = *inpos+1;
8615 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008616 const char *encoding = "charmap";
8617 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008618 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008619 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008620 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008621
Benjamin Petersonbac79492012-01-14 13:34:47 -05008622 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008623 return -1;
8624 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 /* find all unencodable characters */
8626 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008627 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008628 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008629 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008630 val = encoding_map_lookup(ch, mapping);
8631 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 break;
8633 ++collendpos;
8634 continue;
8635 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008636
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008637 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8638 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 if (rep==NULL)
8640 return -1;
8641 else if (rep!=Py_None) {
8642 Py_DECREF(rep);
8643 break;
8644 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008645 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 }
8648 /* cache callback name lookup
8649 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008650 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008651 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008652
8653 switch (*error_handler) {
8654 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008655 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008656 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008657
8658 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008659 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 x = charmapencode_output('?', mapping, res, respos);
8661 if (x==enc_EXCEPTION) {
8662 return -1;
8663 }
8664 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008665 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 return -1;
8667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008668 }
8669 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008670 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008671 *inpos = collendpos;
8672 break;
Victor Stinner50149202015-09-22 00:26:54 +02008673
8674 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008675 /* generate replacement (temporarily (mis)uses p) */
8676 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 char buffer[2+29+1+1];
8678 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008679 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 for (cp = buffer; *cp; ++cp) {
8681 x = charmapencode_output(*cp, mapping, res, respos);
8682 if (x==enc_EXCEPTION)
8683 return -1;
8684 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008685 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 return -1;
8687 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008688 }
8689 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008690 *inpos = collendpos;
8691 break;
Victor Stinner50149202015-09-22 00:26:54 +02008692
Benjamin Peterson14339b62009-01-31 16:36:08 +00008693 default:
Victor Stinner50149202015-09-22 00:26:54 +02008694 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008695 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008697 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008699 if (PyBytes_Check(repunicode)) {
8700 /* Directly copy bytes result to output. */
8701 Py_ssize_t outsize = PyBytes_Size(*res);
8702 Py_ssize_t requiredsize;
8703 repsize = PyBytes_Size(repunicode);
8704 requiredsize = *respos + repsize;
8705 if (requiredsize > outsize)
8706 /* Make room for all additional bytes. */
8707 if (charmapencode_resize(res, respos, requiredsize)) {
8708 Py_DECREF(repunicode);
8709 return -1;
8710 }
8711 memcpy(PyBytes_AsString(*res) + *respos,
8712 PyBytes_AsString(repunicode), repsize);
8713 *respos += repsize;
8714 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008715 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008716 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008717 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008718 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008719 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008720 Py_DECREF(repunicode);
8721 return -1;
8722 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008723 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008724 data = PyUnicode_DATA(repunicode);
8725 kind = PyUnicode_KIND(repunicode);
8726 for (index = 0; index < repsize; index++) {
8727 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8728 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008730 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 return -1;
8732 }
8733 else if (x==enc_FAILED) {
8734 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008735 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 return -1;
8737 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008738 }
8739 *inpos = newpos;
8740 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741 }
8742 return 0;
8743}
8744
Alexander Belopolsky40018472011-02-26 01:02:56 +00008745PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008746_PyUnicode_EncodeCharmap(PyObject *unicode,
8747 PyObject *mapping,
8748 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008750 /* output object */
8751 PyObject *res = NULL;
8752 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008753 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008754 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008756 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008757 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008758 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008759 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008760 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008761 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762
Benjamin Petersonbac79492012-01-14 13:34:47 -05008763 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008764 return NULL;
8765 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008766 data = PyUnicode_DATA(unicode);
8767 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008768
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769 /* Default to Latin-1 */
8770 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008771 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008773 /* allocate enough for a simple encoding without
8774 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008775 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008776 if (res == NULL)
8777 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008778 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008782 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008784 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 if (x==enc_EXCEPTION) /* error */
8786 goto onError;
8787 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008788 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008790 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 &res, &respos)) {
8792 goto onError;
8793 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008794 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 else
8796 /* done with this character => adjust input position */
8797 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008800 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008801 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008802 if (_PyBytes_Resize(&res, respos) < 0)
8803 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008804
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008805 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008806 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008807 return res;
8808
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008810 Py_XDECREF(res);
8811 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008812 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813 return NULL;
8814}
8815
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008816/* Deprecated */
8817PyObject *
8818PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8819 Py_ssize_t size,
8820 PyObject *mapping,
8821 const char *errors)
8822{
8823 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008824 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008825 if (unicode == NULL)
8826 return NULL;
8827 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8828 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008829 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008830}
8831
Alexander Belopolsky40018472011-02-26 01:02:56 +00008832PyObject *
8833PyUnicode_AsCharmapString(PyObject *unicode,
8834 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835{
8836 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 PyErr_BadArgument();
8838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008840 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841}
8842
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008843/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008844static void
8845make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008847 Py_ssize_t startpos, Py_ssize_t endpos,
8848 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008850 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 *exceptionObject = _PyUnicodeTranslateError_Create(
8852 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853 }
8854 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8856 goto onError;
8857 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8858 goto onError;
8859 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8860 goto onError;
8861 return;
8862 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008863 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008864 }
8865}
8866
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008867/* error handling callback helper:
8868 build arguments, call the callback and check the arguments,
8869 put the result into newpos and return the replacement string, which
8870 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008871static PyObject *
8872unicode_translate_call_errorhandler(const char *errors,
8873 PyObject **errorHandler,
8874 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008876 Py_ssize_t startpos, Py_ssize_t endpos,
8877 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008878{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008879 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008880
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008881 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008882 PyObject *restuple;
8883 PyObject *resunicode;
8884
8885 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008887 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008889 }
8890
8891 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008892 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008893 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008895
Petr Viktorinffd97532020-02-11 17:46:57 +01008896 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008897 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008899 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008900 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 Py_DECREF(restuple);
8902 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008903 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008904 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008905 &resunicode, &i_newpos)) {
8906 Py_DECREF(restuple);
8907 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008908 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008909 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008911 else
8912 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008914 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 Py_DECREF(restuple);
8916 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008917 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008918 Py_INCREF(resunicode);
8919 Py_DECREF(restuple);
8920 return resunicode;
8921}
8922
8923/* Lookup the character ch in the mapping and put the result in result,
8924 which must be decrefed by the caller.
8925 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008926static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008928{
Christian Heimes217cfd12007-12-02 14:31:20 +00008929 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008930 PyObject *x;
8931
8932 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008934 x = PyObject_GetItem(mapping, w);
8935 Py_DECREF(w);
8936 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8938 /* No mapping found means: use 1:1 mapping. */
8939 PyErr_Clear();
8940 *result = NULL;
8941 return 0;
8942 } else
8943 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008944 }
8945 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 *result = x;
8947 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008948 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008949 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008951 if (value < 0 || value > MAX_UNICODE) {
8952 PyErr_Format(PyExc_ValueError,
8953 "character mapping must be in range(0x%x)",
8954 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 Py_DECREF(x);
8956 return -1;
8957 }
8958 *result = x;
8959 return 0;
8960 }
8961 else if (PyUnicode_Check(x)) {
8962 *result = x;
8963 return 0;
8964 }
8965 else {
8966 /* wrong return value */
8967 PyErr_SetString(PyExc_TypeError,
8968 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008969 Py_DECREF(x);
8970 return -1;
8971 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008972}
Victor Stinner1194ea02014-04-04 19:37:40 +02008973
8974/* lookup the character, write the result into the writer.
8975 Return 1 if the result was written into the writer, return 0 if the mapping
8976 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008977static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008978charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8979 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008980{
Victor Stinner1194ea02014-04-04 19:37:40 +02008981 PyObject *item;
8982
8983 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008985
8986 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008988 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008991 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008992 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008993
8994 if (item == Py_None) {
8995 Py_DECREF(item);
8996 return 0;
8997 }
8998
8999 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009000 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9001 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9002 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009003 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9004 Py_DECREF(item);
9005 return -1;
9006 }
9007 Py_DECREF(item);
9008 return 1;
9009 }
9010
9011 if (!PyUnicode_Check(item)) {
9012 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009014 }
9015
9016 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9017 Py_DECREF(item);
9018 return -1;
9019 }
9020
9021 Py_DECREF(item);
9022 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009023}
9024
Victor Stinner89a76ab2014-04-05 11:44:04 +02009025static int
9026unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9027 Py_UCS1 *translate)
9028{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009029 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009030 int ret = 0;
9031
Victor Stinner89a76ab2014-04-05 11:44:04 +02009032 if (charmaptranslate_lookup(ch, mapping, &item)) {
9033 return -1;
9034 }
9035
9036 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009037 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009038 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009039 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009040 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009041 /* not found => default to 1:1 mapping */
9042 translate[ch] = ch;
9043 return 1;
9044 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009045 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009046 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009047 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9048 used it */
9049 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009050 /* invalid character or character outside ASCII:
9051 skip the fast translate */
9052 goto exit;
9053 }
9054 translate[ch] = (Py_UCS1)replace;
9055 }
9056 else if (PyUnicode_Check(item)) {
9057 Py_UCS4 replace;
9058
9059 if (PyUnicode_READY(item) == -1) {
9060 Py_DECREF(item);
9061 return -1;
9062 }
9063 if (PyUnicode_GET_LENGTH(item) != 1)
9064 goto exit;
9065
9066 replace = PyUnicode_READ_CHAR(item, 0);
9067 if (replace > 127)
9068 goto exit;
9069 translate[ch] = (Py_UCS1)replace;
9070 }
9071 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009072 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009073 goto exit;
9074 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009075 ret = 1;
9076
Benjamin Peterson1365de72014-04-07 20:15:41 -04009077 exit:
9078 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009079 return ret;
9080}
9081
9082/* Fast path for ascii => ascii translation. Return 1 if the whole string
9083 was translated into writer, return 0 if the input string was partially
9084 translated into writer, raise an exception and return -1 on error. */
9085static int
9086unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009087 _PyUnicodeWriter *writer, int ignore,
9088 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009089{
Victor Stinner872b2912014-04-05 14:27:07 +02009090 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009091 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009092 const Py_UCS1 *in, *end;
9093 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009094 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009095
Victor Stinner89a76ab2014-04-05 11:44:04 +02009096 len = PyUnicode_GET_LENGTH(input);
9097
Victor Stinner872b2912014-04-05 14:27:07 +02009098 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009099
9100 in = PyUnicode_1BYTE_DATA(input);
9101 end = in + len;
9102
9103 assert(PyUnicode_IS_ASCII(writer->buffer));
9104 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9105 out = PyUnicode_1BYTE_DATA(writer->buffer);
9106
Victor Stinner872b2912014-04-05 14:27:07 +02009107 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009108 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009109 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009110 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009111 int translate = unicode_fast_translate_lookup(mapping, ch,
9112 ascii_table);
9113 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009114 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009115 if (translate == 0)
9116 goto exit;
9117 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009118 }
Victor Stinner872b2912014-04-05 14:27:07 +02009119 if (ch2 == 0xfe) {
9120 if (ignore)
9121 continue;
9122 goto exit;
9123 }
9124 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009125 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009126 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009127 }
Victor Stinner872b2912014-04-05 14:27:07 +02009128 res = 1;
9129
9130exit:
9131 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009132 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009133 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009134}
9135
Victor Stinner3222da22015-10-01 22:07:32 +02009136static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009137_PyUnicode_TranslateCharmap(PyObject *input,
9138 PyObject *mapping,
9139 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009142 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009143 Py_ssize_t size, i;
9144 int kind;
9145 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009146 _PyUnicodeWriter writer;
9147 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009148 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009149 PyObject *errorHandler = NULL;
9150 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009151 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009152 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009153
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 PyErr_BadArgument();
9156 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159 if (PyUnicode_READY(input) == -1)
9160 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009161 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009162 kind = PyUnicode_KIND(input);
9163 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009165 if (size == 0)
9166 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009168 /* allocate enough for a simple 1:1 translation without
9169 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009170 _PyUnicodeWriter_Init(&writer);
9171 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173
Victor Stinner872b2912014-04-05 14:27:07 +02009174 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9175
Victor Stinner33798672016-03-01 21:59:58 +01009176 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009177 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009178 if (PyUnicode_IS_ASCII(input)) {
9179 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9180 if (res < 0) {
9181 _PyUnicodeWriter_Dealloc(&writer);
9182 return NULL;
9183 }
9184 if (res == 1)
9185 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009186 }
Victor Stinner33798672016-03-01 21:59:58 +01009187 else {
9188 i = 0;
9189 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009192 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009193 int translate;
9194 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9195 Py_ssize_t newpos;
9196 /* startpos for collecting untranslatable chars */
9197 Py_ssize_t collstart;
9198 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009199 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200
Victor Stinner1194ea02014-04-04 19:37:40 +02009201 ch = PyUnicode_READ(kind, data, i);
9202 translate = charmaptranslate_output(ch, mapping, &writer);
9203 if (translate < 0)
9204 goto onError;
9205
9206 if (translate != 0) {
9207 /* it worked => adjust input pointer */
9208 ++i;
9209 continue;
9210 }
9211
9212 /* untranslatable character */
9213 collstart = i;
9214 collend = i+1;
9215
9216 /* find all untranslatable characters */
9217 while (collend < size) {
9218 PyObject *x;
9219 ch = PyUnicode_READ(kind, data, collend);
9220 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009221 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009222 Py_XDECREF(x);
9223 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009225 ++collend;
9226 }
9227
9228 if (ignore) {
9229 i = collend;
9230 }
9231 else {
9232 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9233 reason, input, &exc,
9234 collstart, collend, &newpos);
9235 if (repunicode == NULL)
9236 goto onError;
9237 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009239 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009240 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009241 Py_DECREF(repunicode);
9242 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009243 }
9244 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009245 Py_XDECREF(exc);
9246 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009247 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009248
Benjamin Peterson29060642009-01-31 22:14:21 +00009249 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009250 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009251 Py_XDECREF(exc);
9252 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253 return NULL;
9254}
9255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256/* Deprecated. Use PyUnicode_Translate instead. */
9257PyObject *
9258PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9259 Py_ssize_t size,
9260 PyObject *mapping,
9261 const char *errors)
9262{
Christian Heimes5f520f42012-09-11 14:03:25 +02009263 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009264 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 if (!unicode)
9266 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009267 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9268 Py_DECREF(unicode);
9269 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270}
9271
Alexander Belopolsky40018472011-02-26 01:02:56 +00009272PyObject *
9273PyUnicode_Translate(PyObject *str,
9274 PyObject *mapping,
9275 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009277 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009278 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009279 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280}
Tim Petersced69f82003-09-16 20:30:58 +00009281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282PyObject *
9283_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9284{
9285 if (!PyUnicode_Check(unicode)) {
9286 PyErr_BadInternalCall();
9287 return NULL;
9288 }
9289 if (PyUnicode_READY(unicode) == -1)
9290 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009291 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 /* If the string is already ASCII, just return the same string */
9293 Py_INCREF(unicode);
9294 return unicode;
9295 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009296
9297 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9298 PyObject *result = PyUnicode_New(len, 127);
9299 if (result == NULL) {
9300 return NULL;
9301 }
9302
9303 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9304 int kind = PyUnicode_KIND(unicode);
9305 const void *data = PyUnicode_DATA(unicode);
9306 Py_ssize_t i;
9307 for (i = 0; i < len; ++i) {
9308 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9309 if (ch < 127) {
9310 out[i] = ch;
9311 }
9312 else if (Py_UNICODE_ISSPACE(ch)) {
9313 out[i] = ' ';
9314 }
9315 else {
9316 int decimal = Py_UNICODE_TODECIMAL(ch);
9317 if (decimal < 0) {
9318 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009319 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009320 _PyUnicode_LENGTH(result) = i + 1;
9321 break;
9322 }
9323 out[i] = '0' + decimal;
9324 }
9325 }
9326
INADA Naoki16dfca42018-07-14 12:06:43 +09009327 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009328 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329}
9330
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009331PyObject *
9332PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9333 Py_ssize_t length)
9334{
Victor Stinnerf0124502011-11-21 23:12:56 +01009335 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009336 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009337 Py_UCS4 maxchar;
9338 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009339 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009340
Victor Stinner99d7ad02012-02-22 13:37:39 +01009341 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009342 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009343 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009344 if (ch > 127) {
9345 int decimal = Py_UNICODE_TODECIMAL(ch);
9346 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009347 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009348 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009349 }
9350 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009351
9352 /* Copy to a new string */
9353 decimal = PyUnicode_New(length, maxchar);
9354 if (decimal == NULL)
9355 return decimal;
9356 kind = PyUnicode_KIND(decimal);
9357 data = PyUnicode_DATA(decimal);
9358 /* Iterate over code points */
9359 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009360 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009361 if (ch > 127) {
9362 int decimal = Py_UNICODE_TODECIMAL(ch);
9363 if (decimal >= 0)
9364 ch = '0' + decimal;
9365 }
9366 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009368 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009369}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009370/* --- Decimal Encoder ---------------------------------------------------- */
9371
Alexander Belopolsky40018472011-02-26 01:02:56 +00009372int
9373PyUnicode_EncodeDecimal(Py_UNICODE *s,
9374 Py_ssize_t length,
9375 char *output,
9376 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009377{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009378 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009379 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009380 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009381 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009382
9383 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009384 PyErr_BadArgument();
9385 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009386 }
9387
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009388 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009389 if (unicode == NULL)
9390 return -1;
9391
Victor Stinner42bf7752011-11-21 22:52:58 +01009392 kind = PyUnicode_KIND(unicode);
9393 data = PyUnicode_DATA(unicode);
9394
Victor Stinnerb84d7232011-11-22 01:50:07 +01009395 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009396 PyObject *exc;
9397 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009398 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009399 Py_ssize_t startpos;
9400
9401 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009402
Benjamin Peterson29060642009-01-31 22:14:21 +00009403 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009404 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009405 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009406 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009407 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009408 decimal = Py_UNICODE_TODECIMAL(ch);
9409 if (decimal >= 0) {
9410 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009411 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009412 continue;
9413 }
9414 if (0 < ch && ch < 256) {
9415 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009416 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 continue;
9418 }
Victor Stinner6345be92011-11-25 20:09:01 +01009419
Victor Stinner42bf7752011-11-21 22:52:58 +01009420 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009421 exc = NULL;
9422 raise_encode_exception(&exc, "decimal", unicode,
9423 startpos, startpos+1,
9424 "invalid decimal Unicode string");
9425 Py_XDECREF(exc);
9426 Py_DECREF(unicode);
9427 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009428 }
9429 /* 0-terminate the output string */
9430 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009431 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009432 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009433}
9434
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435/* --- Helpers ------------------------------------------------------------ */
9436
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` is taken
   relative to `len` and then clamped to 0.  `start` and `end` must be
   modifiable lvalues; every argument may be evaluated more than once.

   Wrapped in do { } while (0) so that the expansion is a single
   statement and stays correct under an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009453any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009455 Py_ssize_t end,
9456 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009458 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009459 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 Py_ssize_t len1, len2, result;
9461
9462 kind1 = PyUnicode_KIND(s1);
9463 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009464 if (kind1 < kind2)
9465 return -1;
9466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 len1 = PyUnicode_GET_LENGTH(s1);
9468 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009469 ADJUST_INDICES(start, end, len1);
9470 if (end - start < len2)
9471 return -1;
9472
9473 buf1 = PyUnicode_DATA(s1);
9474 buf2 = PyUnicode_DATA(s2);
9475 if (len2 == 1) {
9476 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9477 result = findchar((const char *)buf1 + kind1*start,
9478 kind1, end - start, ch, direction);
9479 if (result == -1)
9480 return -1;
9481 else
9482 return start + result;
9483 }
9484
9485 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009486 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009487 if (!buf2)
9488 return -2;
9489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490
Victor Stinner794d5672011-10-10 03:21:36 +02009491 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009492 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009493 case PyUnicode_1BYTE_KIND:
9494 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9495 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9496 else
9497 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9498 break;
9499 case PyUnicode_2BYTE_KIND:
9500 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9501 break;
9502 case PyUnicode_4BYTE_KIND:
9503 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9504 break;
9505 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009506 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009507 }
9508 }
9509 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009510 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009511 case PyUnicode_1BYTE_KIND:
9512 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9513 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9514 else
9515 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9516 break;
9517 case PyUnicode_2BYTE_KIND:
9518 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9519 break;
9520 case PyUnicode_4BYTE_KIND:
9521 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9522 break;
9523 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009524 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 }
9527
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009528 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009529 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009530 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531
9532 return result;
9533}
9534
Victor Stinner59423e32018-11-26 13:40:01 +01009535/* _PyUnicode_InsertThousandsGrouping() helper functions */
9536#include "stringlib/localeutil.h"
9537
9538/**
9539 * InsertThousandsGrouping:
9540 * @writer: Unicode writer.
9541 * @n_buffer: Number of characters in @buffer.
9542 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9543 * @d_pos: Start of digits string.
9544 * @n_digits: The number of digits in the string, in which we want
9545 * to put the grouping chars.
9546 * @min_width: The minimum width of the digits in the output string.
9547 * Output will be zero-padded on the left to fill.
9548 * @grouping: see definition in localeconv().
9549 * @thousands_sep: see definition in localeconv().
9550 *
9551 * There are 2 modes: counting and filling. If @writer is NULL,
9552 * we are in counting mode, else filling mode.
9553 * If counting, the required buffer size is returned.
9554 * If filling, we know the buffer will be large enough, so we don't
9555 * need to pass in the buffer size.
9556 * Inserts thousand grouping characters (as defined by grouping and
9557 * thousands_sep) into @writer.
9558 *
9559 * Return value: -1 on error, number of characters otherwise.
9560 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009562_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009563 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009564 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009565 PyObject *digits,
9566 Py_ssize_t d_pos,
9567 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009568 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009569 const char *grouping,
9570 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009571 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572{
Xtreak3f7983a2019-01-07 20:39:14 +05309573 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009574 if (writer) {
9575 assert(digits != NULL);
9576 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009577 }
9578 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009579 assert(digits == NULL);
9580 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009581 }
Victor Stinner59423e32018-11-26 13:40:01 +01009582 assert(0 <= d_pos);
9583 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009584 assert(grouping != NULL);
9585
9586 if (digits != NULL) {
9587 if (PyUnicode_READY(digits) == -1) {
9588 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009589 }
Victor Stinner59423e32018-11-26 13:40:01 +01009590 }
9591 if (PyUnicode_READY(thousands_sep) == -1) {
9592 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009593 }
9594
Victor Stinner59423e32018-11-26 13:40:01 +01009595 Py_ssize_t count = 0;
9596 Py_ssize_t n_zeros;
9597 int loop_broken = 0;
9598 int use_separator = 0; /* First time through, don't append the
9599 separator. They only go between
9600 groups. */
9601 Py_ssize_t buffer_pos;
9602 Py_ssize_t digits_pos;
9603 Py_ssize_t len;
9604 Py_ssize_t n_chars;
9605 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9606 be looked at */
9607 /* A generator that returns all of the grouping widths, until it
9608 returns 0. */
9609 GroupGenerator groupgen;
9610 GroupGenerator_init(&groupgen, grouping);
9611 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9612
9613 /* if digits are not grouped, thousands separator
9614 should be an empty string */
9615 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9616
9617 digits_pos = d_pos + n_digits;
9618 if (writer) {
9619 buffer_pos = writer->pos + n_buffer;
9620 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9621 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 }
Victor Stinner59423e32018-11-26 13:40:01 +01009623 else {
9624 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009625 }
Victor Stinner59423e32018-11-26 13:40:01 +01009626
9627 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009628 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009629 }
Victor Stinner59423e32018-11-26 13:40:01 +01009630
9631 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9632 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9633 n_zeros = Py_MAX(0, len - remaining);
9634 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9635
9636 /* Use n_zero zero's and n_chars chars */
9637
9638 /* Count only, don't do anything. */
9639 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9640
9641 /* Copy into the writer. */
9642 InsertThousandsGrouping_fill(writer, &buffer_pos,
9643 digits, &digits_pos,
9644 n_chars, n_zeros,
9645 use_separator ? thousands_sep : NULL,
9646 thousands_sep_len, maxchar);
9647
9648 /* Use a separator next time. */
9649 use_separator = 1;
9650
9651 remaining -= n_chars;
9652 min_width -= len;
9653
9654 if (remaining <= 0 && min_width <= 0) {
9655 loop_broken = 1;
9656 break;
9657 }
9658 min_width -= thousands_sep_len;
9659 }
9660 if (!loop_broken) {
9661 /* We left the loop without using a break statement. */
9662
9663 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9664 n_zeros = Py_MAX(0, len - remaining);
9665 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9666
9667 /* Use n_zero zero's and n_chars chars */
9668 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9669
9670 /* Copy into the writer. */
9671 InsertThousandsGrouping_fill(writer, &buffer_pos,
9672 digits, &digits_pos,
9673 n_chars, n_zeros,
9674 use_separator ? thousands_sep : NULL,
9675 thousands_sep_len, maxchar);
9676 }
9677 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678}
9679
9680
Alexander Belopolsky40018472011-02-26 01:02:56 +00009681Py_ssize_t
9682PyUnicode_Count(PyObject *str,
9683 PyObject *substr,
9684 Py_ssize_t start,
9685 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009687 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009688 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009689 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009691
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009692 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009693 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009694
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009695 kind1 = PyUnicode_KIND(str);
9696 kind2 = PyUnicode_KIND(substr);
9697 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009698 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009699
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009700 len1 = PyUnicode_GET_LENGTH(str);
9701 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009703 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009704 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009705
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009706 buf1 = PyUnicode_DATA(str);
9707 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009708 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009709 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009710 if (!buf2)
9711 goto onError;
9712 }
9713
9714 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009716 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009717 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009718 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009719 buf2, len2, PY_SSIZE_T_MAX
9720 );
9721 else
9722 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009723 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009724 buf2, len2, PY_SSIZE_T_MAX
9725 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 break;
9727 case PyUnicode_2BYTE_KIND:
9728 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009729 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 buf2, len2, PY_SSIZE_T_MAX
9731 );
9732 break;
9733 case PyUnicode_4BYTE_KIND:
9734 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009735 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 buf2, len2, PY_SSIZE_T_MAX
9737 );
9738 break;
9739 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009740 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009742
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009743 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009744 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009745 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009746
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009749 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9750 if (kind2 != kind1)
9751 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753}
9754
Alexander Belopolsky40018472011-02-26 01:02:56 +00009755Py_ssize_t
9756PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009757 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009758 Py_ssize_t start,
9759 Py_ssize_t end,
9760 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009762 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009763 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009764
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009765 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766}
9767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768Py_ssize_t
9769PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9770 Py_ssize_t start, Py_ssize_t end,
9771 int direction)
9772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009774 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 if (PyUnicode_READY(str) == -1)
9776 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009777 len = PyUnicode_GET_LENGTH(str);
9778 ADJUST_INDICES(start, end, len);
9779 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009780 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009782 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9783 kind, end-start, ch, direction);
9784 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009786 else
9787 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009788}
9789
Alexander Belopolsky40018472011-02-26 01:02:56 +00009790static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009791tailmatch(PyObject *self,
9792 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009793 Py_ssize_t start,
9794 Py_ssize_t end,
9795 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 int kind_self;
9798 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009799 const void *data_self;
9800 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 Py_ssize_t offset;
9802 Py_ssize_t i;
9803 Py_ssize_t end_sub;
9804
9805 if (PyUnicode_READY(self) == -1 ||
9806 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009807 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9810 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009812 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009814 if (PyUnicode_GET_LENGTH(substring) == 0)
9815 return 1;
9816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 kind_self = PyUnicode_KIND(self);
9818 data_self = PyUnicode_DATA(self);
9819 kind_sub = PyUnicode_KIND(substring);
9820 data_sub = PyUnicode_DATA(substring);
9821 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9822
9823 if (direction > 0)
9824 offset = end;
9825 else
9826 offset = start;
9827
9828 if (PyUnicode_READ(kind_self, data_self, offset) ==
9829 PyUnicode_READ(kind_sub, data_sub, 0) &&
9830 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9831 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9832 /* If both are of the same kind, memcmp is sufficient */
9833 if (kind_self == kind_sub) {
9834 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009835 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836 data_sub,
9837 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009838 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009840 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 else {
9842 /* We do not need to compare 0 and len(substring)-1 because
9843 the if statement above ensured already that they are equal
9844 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 for (i = 1; i < end_sub; ++i) {
9846 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9847 PyUnicode_READ(kind_sub, data_sub, i))
9848 return 0;
9849 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009850 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852 }
9853
9854 return 0;
9855}
9856
Alexander Belopolsky40018472011-02-26 01:02:56 +00009857Py_ssize_t
9858PyUnicode_Tailmatch(PyObject *str,
9859 PyObject *substr,
9860 Py_ssize_t start,
9861 Py_ssize_t end,
9862 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009864 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009865 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009866
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009867 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868}
9869
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009870static PyObject *
9871ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009873 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009874 const char *data = PyUnicode_DATA(self);
9875 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009876 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009877
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009878 res = PyUnicode_New(len, 127);
9879 if (res == NULL)
9880 return NULL;
9881 resdata = PyUnicode_DATA(res);
9882 if (lower)
9883 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009885 _Py_bytes_upper(resdata, data, len);
9886 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887}
9888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009890handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009892 Py_ssize_t j;
9893 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009894 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009895 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009896
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009897 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9898
9899 where ! is a negation and \p{xxx} is a character with property xxx.
9900 */
9901 for (j = i - 1; j >= 0; j--) {
9902 c = PyUnicode_READ(kind, data, j);
9903 if (!_PyUnicode_IsCaseIgnorable(c))
9904 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009906 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9907 if (final_sigma) {
9908 for (j = i + 1; j < length; j++) {
9909 c = PyUnicode_READ(kind, data, j);
9910 if (!_PyUnicode_IsCaseIgnorable(c))
9911 break;
9912 }
9913 final_sigma = j == length || !_PyUnicode_IsCased(c);
9914 }
9915 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916}
9917
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009918static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009919lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009920 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009921{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009922 /* Obscure special case. */
9923 if (c == 0x3A3) {
9924 mapped[0] = handle_capital_sigma(kind, data, length, i);
9925 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009927 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928}
9929
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009930static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009931do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009933 Py_ssize_t i, k = 0;
9934 int n_res, j;
9935 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009936
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009937 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009938 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009939 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009940 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009941 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009943 for (i = 1; i < length; i++) {
9944 c = PyUnicode_READ(kind, data, i);
9945 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9946 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009947 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009948 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009949 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009950 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009951 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952}
9953
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009954static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009955do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009956 Py_ssize_t i, k = 0;
9957
9958 for (i = 0; i < length; i++) {
9959 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9960 int n_res, j;
9961 if (Py_UNICODE_ISUPPER(c)) {
9962 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9963 }
9964 else if (Py_UNICODE_ISLOWER(c)) {
9965 n_res = _PyUnicode_ToUpperFull(c, mapped);
9966 }
9967 else {
9968 n_res = 1;
9969 mapped[0] = c;
9970 }
9971 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009972 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009973 res[k++] = mapped[j];
9974 }
9975 }
9976 return k;
9977}
9978
9979static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009980do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009981 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009983 Py_ssize_t i, k = 0;
9984
9985 for (i = 0; i < length; i++) {
9986 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9987 int n_res, j;
9988 if (lower)
9989 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9990 else
9991 n_res = _PyUnicode_ToUpperFull(c, mapped);
9992 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009993 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009994 res[k++] = mapped[j];
9995 }
9996 }
9997 return k;
9998}
9999
10000static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010001do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010002{
10003 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10004}
10005
10006static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010007do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010008{
10009 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10010}
10011
Benjamin Petersone51757f2012-01-12 21:10:29 -050010012static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010013do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010014{
10015 Py_ssize_t i, k = 0;
10016
10017 for (i = 0; i < length; i++) {
10018 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10019 Py_UCS4 mapped[3];
10020 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10021 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010022 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010023 res[k++] = mapped[j];
10024 }
10025 }
10026 return k;
10027}
10028
10029static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010030do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010031{
10032 Py_ssize_t i, k = 0;
10033 int previous_is_cased;
10034
10035 previous_is_cased = 0;
10036 for (i = 0; i < length; i++) {
10037 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10038 Py_UCS4 mapped[3];
10039 int n_res, j;
10040
10041 if (previous_is_cased)
10042 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10043 else
10044 n_res = _PyUnicode_ToTitleFull(c, mapped);
10045
10046 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010047 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010048 res[k++] = mapped[j];
10049 }
10050
10051 previous_is_cased = _PyUnicode_IsCased(c);
10052 }
10053 return k;
10054}
10055
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010056static PyObject *
10057case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010058 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010059{
10060 PyObject *res = NULL;
10061 Py_ssize_t length, newlength = 0;
10062 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010063 const void *data;
10064 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010065 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10066
Benjamin Petersoneea48462012-01-16 14:28:50 -050010067 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010068
10069 kind = PyUnicode_KIND(self);
10070 data = PyUnicode_DATA(self);
10071 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010072 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010073 PyErr_SetString(PyExc_OverflowError, "string is too long");
10074 return NULL;
10075 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010076 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010077 if (tmp == NULL)
10078 return PyErr_NoMemory();
10079 newlength = perform(kind, data, length, tmp, &maxchar);
10080 res = PyUnicode_New(newlength, maxchar);
10081 if (res == NULL)
10082 goto leave;
10083 tmpend = tmp + newlength;
10084 outdata = PyUnicode_DATA(res);
10085 outkind = PyUnicode_KIND(res);
10086 switch (outkind) {
10087 case PyUnicode_1BYTE_KIND:
10088 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10089 break;
10090 case PyUnicode_2BYTE_KIND:
10091 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10092 break;
10093 case PyUnicode_4BYTE_KIND:
10094 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10095 break;
10096 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010097 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010098 }
10099 leave:
10100 PyMem_FREE(tmp);
10101 return res;
10102}
10103
Tim Peters8ce9f162004-08-27 01:49:32 +000010104PyObject *
10105PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010107 PyObject *res;
10108 PyObject *fseq;
10109 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010110 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010112 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010113 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010114 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010115 }
10116
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010117 /* NOTE: the following code can't call back into Python code,
10118 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010119 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010120
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010121 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010122 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010123 res = _PyUnicode_JoinArray(separator, items, seqlen);
10124 Py_DECREF(fseq);
10125 return res;
10126}
10127
10128PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010129_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010130{
10131 PyObject *res = NULL; /* the result */
10132 PyObject *sep = NULL;
10133 Py_ssize_t seplen;
10134 PyObject *item;
10135 Py_ssize_t sz, i, res_offset;
10136 Py_UCS4 maxchar;
10137 Py_UCS4 item_maxchar;
10138 int use_memcpy;
10139 unsigned char *res_data = NULL, *sep_data = NULL;
10140 PyObject *last_obj;
10141 unsigned int kind = 0;
10142
Tim Peters05eba1f2004-08-27 21:32:02 +000010143 /* If empty sequence, return u"". */
10144 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010145 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010146 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010147
Tim Peters05eba1f2004-08-27 21:32:02 +000010148 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010149 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010150 if (seqlen == 1) {
10151 if (PyUnicode_CheckExact(items[0])) {
10152 res = items[0];
10153 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010154 return res;
10155 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010156 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010157 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010158 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010159 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010160 /* Set up sep and seplen */
10161 if (separator == NULL) {
10162 /* fall back to a blank space separator */
10163 sep = PyUnicode_FromOrdinal(' ');
10164 if (!sep)
10165 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010166 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010167 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010168 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010169 else {
10170 if (!PyUnicode_Check(separator)) {
10171 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010172 "separator: expected str instance,"
10173 " %.80s found",
10174 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010175 goto onError;
10176 }
10177 if (PyUnicode_READY(separator))
10178 goto onError;
10179 sep = separator;
10180 seplen = PyUnicode_GET_LENGTH(separator);
10181 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10182 /* inc refcount to keep this code path symmetric with the
10183 above case of a blank separator */
10184 Py_INCREF(sep);
10185 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010186 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010187 }
10188
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010189 /* There are at least two things to join, or else we have a subclass
10190 * of str in the sequence.
10191 * Do a pre-pass to figure out the total amount of space we'll
10192 * need (sz), and see whether all argument are strings.
10193 */
10194 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010195#ifdef Py_DEBUG
10196 use_memcpy = 0;
10197#else
10198 use_memcpy = 1;
10199#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010200 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010201 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010202 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010203 if (!PyUnicode_Check(item)) {
10204 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010205 "sequence item %zd: expected str instance,"
10206 " %.80s found",
10207 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010208 goto onError;
10209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 if (PyUnicode_READY(item) == -1)
10211 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010212 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010214 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010215 if (i != 0) {
10216 add_sz += seplen;
10217 }
10218 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010219 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010220 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010221 goto onError;
10222 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010223 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010224 if (use_memcpy && last_obj != NULL) {
10225 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10226 use_memcpy = 0;
10227 }
10228 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010229 }
Tim Petersced69f82003-09-16 20:30:58 +000010230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010232 if (res == NULL)
10233 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010234
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010235 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010236#ifdef Py_DEBUG
10237 use_memcpy = 0;
10238#else
10239 if (use_memcpy) {
10240 res_data = PyUnicode_1BYTE_DATA(res);
10241 kind = PyUnicode_KIND(res);
10242 if (seplen != 0)
10243 sep_data = PyUnicode_1BYTE_DATA(sep);
10244 }
10245#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010246 if (use_memcpy) {
10247 for (i = 0; i < seqlen; ++i) {
10248 Py_ssize_t itemlen;
10249 item = items[i];
10250
10251 /* Copy item, and maybe the separator. */
10252 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010253 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010254 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010255 kind * seplen);
10256 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010257 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010258
10259 itemlen = PyUnicode_GET_LENGTH(item);
10260 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010261 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010262 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010263 kind * itemlen);
10264 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010265 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010266 }
10267 assert(res_data == PyUnicode_1BYTE_DATA(res)
10268 + kind * PyUnicode_GET_LENGTH(res));
10269 }
10270 else {
10271 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10272 Py_ssize_t itemlen;
10273 item = items[i];
10274
10275 /* Copy item, and maybe the separator. */
10276 if (i && seplen != 0) {
10277 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10278 res_offset += seplen;
10279 }
10280
10281 itemlen = PyUnicode_GET_LENGTH(item);
10282 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010283 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010284 res_offset += itemlen;
10285 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010286 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010287 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010288 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010291 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010293
Benjamin Peterson29060642009-01-31 22:14:21 +000010294 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010296 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297 return NULL;
10298}
10299
Victor Stinnerd3f08822012-05-29 12:57:52 +020010300void
10301_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10302 Py_UCS4 fill_char)
10303{
10304 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010305 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010306 assert(PyUnicode_IS_READY(unicode));
10307 assert(unicode_modifiable(unicode));
10308 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10309 assert(start >= 0);
10310 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010311 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010312}
10313
Victor Stinner3fe55312012-01-04 00:33:50 +010010314Py_ssize_t
10315PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10316 Py_UCS4 fill_char)
10317{
10318 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010319
10320 if (!PyUnicode_Check(unicode)) {
10321 PyErr_BadInternalCall();
10322 return -1;
10323 }
10324 if (PyUnicode_READY(unicode) == -1)
10325 return -1;
10326 if (unicode_check_modifiable(unicode))
10327 return -1;
10328
Victor Stinnerd3f08822012-05-29 12:57:52 +020010329 if (start < 0) {
10330 PyErr_SetString(PyExc_IndexError, "string index out of range");
10331 return -1;
10332 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010333 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10334 PyErr_SetString(PyExc_ValueError,
10335 "fill character is bigger than "
10336 "the string maximum character");
10337 return -1;
10338 }
10339
10340 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10341 length = Py_MIN(maxlen, length);
10342 if (length <= 0)
10343 return 0;
10344
Victor Stinnerd3f08822012-05-29 12:57:52 +020010345 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010346 return length;
10347}
10348
Victor Stinner9310abb2011-10-05 00:59:23 +020010349static PyObject *
10350pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010351 Py_ssize_t left,
10352 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 PyObject *u;
10356 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010357 int kind;
10358 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359
10360 if (left < 0)
10361 left = 0;
10362 if (right < 0)
10363 right = 0;
10364
Victor Stinnerc4b49542011-12-11 22:44:26 +010010365 if (left == 0 && right == 0)
10366 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10369 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010370 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10371 return NULL;
10372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010374 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010376 if (!u)
10377 return NULL;
10378
10379 kind = PyUnicode_KIND(u);
10380 data = PyUnicode_DATA(u);
10381 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010382 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010383 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010384 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010385 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010386 assert(_PyUnicode_CheckConsistency(u, 1));
10387 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388}
10389
Alexander Belopolsky40018472011-02-26 01:02:56 +000010390PyObject *
10391PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010395 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397
Benjamin Petersonead6b532011-12-20 17:23:42 -060010398 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010400 if (PyUnicode_IS_ASCII(string))
10401 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010402 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010403 PyUnicode_GET_LENGTH(string), keepends);
10404 else
10405 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010406 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010407 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 break;
10409 case PyUnicode_2BYTE_KIND:
10410 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010411 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 PyUnicode_GET_LENGTH(string), keepends);
10413 break;
10414 case PyUnicode_4BYTE_KIND:
10415 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010416 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 PyUnicode_GET_LENGTH(string), keepends);
10418 break;
10419 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010420 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010422 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010423}
10424
Alexander Belopolsky40018472011-02-26 01:02:56 +000010425static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010426split(PyObject *self,
10427 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010428 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010430 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010431 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 Py_ssize_t len1, len2;
10433 PyObject* out;
10434
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010436 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 if (PyUnicode_READY(self) == -1)
10439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010442 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010444 if (PyUnicode_IS_ASCII(self))
10445 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010446 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010447 PyUnicode_GET_LENGTH(self), maxcount
10448 );
10449 else
10450 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010451 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010452 PyUnicode_GET_LENGTH(self), maxcount
10453 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 case PyUnicode_2BYTE_KIND:
10455 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010456 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 PyUnicode_GET_LENGTH(self), maxcount
10458 );
10459 case PyUnicode_4BYTE_KIND:
10460 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010461 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 PyUnicode_GET_LENGTH(self), maxcount
10463 );
10464 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010465 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 }
10467
10468 if (PyUnicode_READY(substring) == -1)
10469 return NULL;
10470
10471 kind1 = PyUnicode_KIND(self);
10472 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 len1 = PyUnicode_GET_LENGTH(self);
10474 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010475 if (kind1 < kind2 || len1 < len2) {
10476 out = PyList_New(1);
10477 if (out == NULL)
10478 return NULL;
10479 Py_INCREF(self);
10480 PyList_SET_ITEM(out, 0, self);
10481 return out;
10482 }
10483 buf1 = PyUnicode_DATA(self);
10484 buf2 = PyUnicode_DATA(substring);
10485 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010486 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010487 if (!buf2)
10488 return NULL;
10489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010491 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010493 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10494 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010495 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010496 else
10497 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010498 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 break;
10500 case PyUnicode_2BYTE_KIND:
10501 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010502 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 break;
10504 case PyUnicode_4BYTE_KIND:
10505 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010506 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 break;
10508 default:
10509 out = NULL;
10510 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010511 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010512 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010513 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515}
10516
Alexander Belopolsky40018472011-02-26 01:02:56 +000010517static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010518rsplit(PyObject *self,
10519 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010520 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010521{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010522 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010523 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 Py_ssize_t len1, len2;
10525 PyObject* out;
10526
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010527 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010528 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 if (PyUnicode_READY(self) == -1)
10531 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010534 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010536 if (PyUnicode_IS_ASCII(self))
10537 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010538 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010539 PyUnicode_GET_LENGTH(self), maxcount
10540 );
10541 else
10542 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010543 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010544 PyUnicode_GET_LENGTH(self), maxcount
10545 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 case PyUnicode_2BYTE_KIND:
10547 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010548 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 PyUnicode_GET_LENGTH(self), maxcount
10550 );
10551 case PyUnicode_4BYTE_KIND:
10552 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010553 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 PyUnicode_GET_LENGTH(self), maxcount
10555 );
10556 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010557 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 }
10559
10560 if (PyUnicode_READY(substring) == -1)
10561 return NULL;
10562
10563 kind1 = PyUnicode_KIND(self);
10564 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 len1 = PyUnicode_GET_LENGTH(self);
10566 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010567 if (kind1 < kind2 || len1 < len2) {
10568 out = PyList_New(1);
10569 if (out == NULL)
10570 return NULL;
10571 Py_INCREF(self);
10572 PyList_SET_ITEM(out, 0, self);
10573 return out;
10574 }
10575 buf1 = PyUnicode_DATA(self);
10576 buf2 = PyUnicode_DATA(substring);
10577 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010578 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010579 if (!buf2)
10580 return NULL;
10581 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010583 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010585 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10586 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010587 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010588 else
10589 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010590 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 break;
10592 case PyUnicode_2BYTE_KIND:
10593 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010594 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 break;
10596 case PyUnicode_4BYTE_KIND:
10597 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010598 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 break;
10600 default:
10601 out = NULL;
10602 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010603 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010604 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010605 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 return out;
10607}
10608
10609static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010610anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10611 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010613 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010615 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10616 return asciilib_find(buf1, len1, buf2, len2, offset);
10617 else
10618 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 case PyUnicode_2BYTE_KIND:
10620 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10621 case PyUnicode_4BYTE_KIND:
10622 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10623 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010624 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625}
10626
10627static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010628anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10629 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010631 switch (kind) {
10632 case PyUnicode_1BYTE_KIND:
10633 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10634 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10635 else
10636 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10637 case PyUnicode_2BYTE_KIND:
10638 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10639 case PyUnicode_4BYTE_KIND:
10640 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10641 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010642 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010643}
10644
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010645static void
10646replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10647 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10648{
10649 int kind = PyUnicode_KIND(u);
10650 void *data = PyUnicode_DATA(u);
10651 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10652 if (kind == PyUnicode_1BYTE_KIND) {
10653 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10654 (Py_UCS1 *)data + len,
10655 u1, u2, maxcount);
10656 }
10657 else if (kind == PyUnicode_2BYTE_KIND) {
10658 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10659 (Py_UCS2 *)data + len,
10660 u1, u2, maxcount);
10661 }
10662 else {
10663 assert(kind == PyUnicode_4BYTE_KIND);
10664 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10665 (Py_UCS4 *)data + len,
10666 u1, u2, maxcount);
10667 }
10668}
10669
/* Return a string built from `self` with occurrences of `str1` replaced by
   `str2`.

   maxcount < 0 is turned into PY_SSIZE_T_MAX; maxcount == 0 replaces
   nothing.  When there is nothing to replace, the result of
   unicode_result_unchanged(self) is returned.  Returns NULL on error.

   Buffer ownership: sbuf/buf1/buf2 start out pointing at the data of
   self/str1/str2.  Whenever one of them is swapped for a buffer obtained
   from unicode_askind(), the matching flag (srelease/release1/release2) is
   set so that every exit label frees exactly those buffers.

   NOTE(review): kind, data and length of all three objects are read before
   any check, so callers presumably pass ready unicode objects; confirm. */
static PyObject *
replace(PyObject *self, PyObject *str1,
        PyObject *str2, Py_ssize_t maxcount)
{
    PyObject *u;
    const char *sbuf = PyUnicode_DATA(self);
    const void *buf1 = PyUnicode_DATA(str1);
    const void *buf2 = PyUnicode_DATA(str2);
    int srelease = 0, release1 = 0, release2 = 0;
    int skind = PyUnicode_KIND(self);
    int kind1 = PyUnicode_KIND(str1);
    int kind2 = PyUnicode_KIND(str2);
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
    int mayshrink;
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;

    /* A pattern longer than self cannot occur in it. */
    if (slen < len1)
        goto nothing;

    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0)
        goto nothing;

    /* Pattern and replacement are the same object: result equals self. */
    if (str1 == str2)
        goto nothing;

    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
    if (maxchar < maxchar_str1)
        /* substring too wide to be present */
        goto nothing;
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
    /* Replacing str1 with str2 may cause a maxchar reduction in the
       result string. */
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
    /* From here on maxchar is the maximum character used to allocate the
       result: it must accommodate both self and the replacement. */
    maxchar = Py_MAX(maxchar, maxchar_str2);

    if (len1 == len2) {
        /* same length: the result has exactly slen characters, so it is
           built as a copy of self with the matches overwritten */
        if (len1 == 0)
            goto nothing;
        if (len1 == 1) {
            /* replace characters */
            Py_UCS4 u1, u2;
            Py_ssize_t pos;

            u1 = PyUnicode_READ(kind1, buf1, 0);
            /* Locate the first occurrence so that a string without any
               match is returned without allocating a copy. */
            pos = findchar(sbuf, skind, slen, u1, 1);
            if (pos < 0)
                goto nothing;
            u2 = PyUnicode_READ(kind2, buf2, 0);
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;

            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
            replace_1char_inplace(u, pos, u1, u2, maxcount);
        }
        else {
            /* rkind: kind the buffers are brought to before searching and
               copying; the code multiplies it with character counts to get
               byte counts (see the memcpy calls below). */
            int rkind = skind;
            char *res;
            Py_ssize_t i;

            if (kind1 < rkind) {
                /* widen substring */
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            /* Find the first match before allocating anything for the
               result. */
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
            if (i < 0)
                goto nothing;
            if (rkind > kind2) {
                /* widen replacement */
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
                if (!buf2) goto error;
                release2 = 1;
            }
            else if (rkind < kind2) {
                /* widen self and buf1 */
                rkind = kind2;
                if (release1) {
                    /* buf1 was already converted to the old rkind above;
                       drop that copy and restart from str1's own data so
                       the flag and pointer stay consistent if a later
                       allocation fails. */
                    assert(buf1 != PyUnicode_DATA(str1));
                    PyMem_Free((void *)buf1);
                    buf1 = PyUnicode_DATA(str1);
                    release1 = 0;
                }
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
                if (!sbuf) goto error;
                srelease = 1;
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;
            assert(PyUnicode_KIND(u) == rkind);
            res = PyUnicode_DATA(u);

            /* Start from a full copy of self's characters. */
            memcpy(res, sbuf, rkind * slen);
            /* change everything in-place, starting with this one */
            memcpy(res + rkind * i,
                   buf2,
                   rkind * len2);
            i += len1;

            /* The first match was handled above, hence the pre-decrement.
               Each search covers the remainder of sbuf starting at
               character i; the value returned by anylib_find() is used
               directly as a character index into res. */
            while ( --maxcount > 0) {
                i = anylib_find(rkind, self,
                                sbuf+rkind*i, slen-i,
                                str1, buf1, len1, i);
                if (i == -1)
                    break;
                memcpy(res + rkind * i,
                       buf2,
                       rkind * len2);
                i += len1;
            }
        }
    }
    else {
        /* different lengths: count the matches first to size the result,
           then assemble it piece by piece.
           n: number of replacements, i: read index in sbuf,
           j: index of the current match, ires: write index in res. */
        Py_ssize_t n, i, j, ires;
        Py_ssize_t new_size;
        int rkind = skind;
        char *res;

        if (kind1 < rkind) {
            /* widen substring */
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
        if (n == 0)
            goto nothing;
        if (kind2 < rkind) {
            /* widen replacement */
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
            if (!buf2) goto error;
            release2 = 1;
        }
        else if (kind2 > rkind) {
            /* widen self and buf1 */
            rkind = kind2;
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
            if (!sbuf) goto error;
            srelease = 1;
            if (release1) {
                /* Drop the copy made for the old rkind and re-convert from
                   str1's own data. */
                assert(buf1 != PyUnicode_DATA(str1));
                PyMem_Free((void *)buf1);
                buf1 = PyUnicode_DATA(str1);
                release1 = 0;
            }
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
           PyUnicode_GET_LENGTH(str1))); */
        /* Only growth can overflow; the check is arranged as a division so
           that it cannot overflow itself (n != 0 here). */
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                goto error;
        }
        new_size = slen + n * (len2 - len1);
        if (new_size == 0) {
            /* Everything was replaced by nothing: use the empty string
               object instead of allocating. */
            _Py_INCREF_UNICODE_EMPTY();
            if (!unicode_empty)
                goto error;
            u = unicode_empty;
            goto done;
        }
        /* Guard the rkind * count byte computations used below. */
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
            PyErr_SetString(PyExc_OverflowError,
                            "replace string is too long");
            goto error;
        }
        u = PyUnicode_New(new_size, maxchar);
        if (!u)
            goto error;
        assert(PyUnicode_KIND(u) == rkind);
        res = PyUnicode_DATA(u);
        ires = i = 0;
        if (len1 > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = anylib_find(rkind, self,
                                sbuf + rkind * i, slen-i,
                                str1, buf1, len1, i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    memcpy(res + rkind * ires,
                           sbuf + rkind * i,
                           rkind * (j-i));
                    ires += j - i;
                }
                /* copy substitution string */
                if (len2 > 0) {
                    memcpy(res + rkind * ires,
                           buf2,
                           rkind * len2);
                    ires += len2;
                }
                i = j + len1;
            }
            if (i < slen)
                /* copy tail [i:] */
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind * (slen-i));
        }
        else {
            /* interleave: the pattern is empty, so a copy of str2 is
               written, then one character of self, and so on; after the
               n-th copy of str2 the loop stops and the rest of self is
               appended in one go. */
            while (n > 0) {
                memcpy(res + rkind * ires,
                       buf2,
                       rkind * len2);
                ires += len2;
                if (--n <= 0)
                    break;
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind);
                ires++;
                i++;
            }
            memcpy(res + rkind * ires,
                   sbuf + rkind * i,
                   rkind * (slen-i));
        }
    }

    if (mayshrink) {
        /* The result was allocated for the larger maxchar; let
           unicode_adjust_maxchar() adjust it.  It takes &u and may leave
           u == NULL, which is treated as an error. */
        unicode_adjust_maxchar(&u);
        if (u == NULL)
            goto error;
    }

  done:
    /* Success: free the temporary buffers and return the result. */
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
    if (srelease)
        PyMem_FREE((void *)sbuf);
    if (release1)
        PyMem_FREE((void *)buf1);
    if (release2)
        PyMem_FREE((void *)buf2);
    assert(_PyUnicode_CheckConsistency(u, 1));
    return u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
    if (srelease)
        PyMem_FREE((void *)sbuf);
    if (release1)
        PyMem_FREE((void *)buf1);
    if (release2)
        PyMem_FREE((void *)buf2);
    return unicode_result_unchanged(self);

  error:
    /* Failure: free the temporary buffers and return NULL.  No jump to
       this label happens while u holds a live result, so there is nothing
       else to release. */
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
    if (srelease)
        PyMem_FREE((void *)sbuf);
    if (release1)
        PyMem_FREE((void *)buf1);
    if (release2)
        PyMem_FREE((void *)buf2);
    return NULL;
}
10951
10952/* --- Unicode Object Methods --------------------------------------------- */
10953
INADA Naoki3ae20562017-01-16 20:41:20 +090010954/*[clinic input]
10955str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956
INADA Naoki3ae20562017-01-16 20:41:20 +090010957Return a version of the string where each word is titlecased.
10958
10959More specifically, words start with uppercased characters and all remaining
10960cased characters have lower case.
10961[clinic start generated code]*/
10962
10963static PyObject *
10964unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010965/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010967 if (PyUnicode_READY(self) == -1)
10968 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010969 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970}
10971
INADA Naoki3ae20562017-01-16 20:41:20 +090010972/*[clinic input]
10973str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974
INADA Naoki3ae20562017-01-16 20:41:20 +090010975Return a capitalized version of the string.
10976
10977More specifically, make the first character have upper case and the rest lower
10978case.
10979[clinic start generated code]*/
10980
10981static PyObject *
10982unicode_capitalize_impl(PyObject *self)
10983/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010985 if (PyUnicode_READY(self) == -1)
10986 return NULL;
10987 if (PyUnicode_GET_LENGTH(self) == 0)
10988 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010989 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990}
10991
INADA Naoki3ae20562017-01-16 20:41:20 +090010992/*[clinic input]
10993str.casefold as unicode_casefold
10994
10995Return a version of the string suitable for caseless comparisons.
10996[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010997
10998static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010999unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011000/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011001{
11002 if (PyUnicode_READY(self) == -1)
11003 return NULL;
11004 if (PyUnicode_IS_ASCII(self))
11005 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011006 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011007}
11008
11009
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011010/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011011
11012static int
11013convert_uc(PyObject *obj, void *addr)
11014{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011016
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011017 if (!PyUnicode_Check(obj)) {
11018 PyErr_Format(PyExc_TypeError,
11019 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011020 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011021 return 0;
11022 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011023 if (PyUnicode_READY(obj) < 0)
11024 return 0;
11025 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011026 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011028 return 0;
11029 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011030 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011031 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011032}
11033
INADA Naoki3ae20562017-01-16 20:41:20 +090011034/*[clinic input]
11035str.center as unicode_center
11036
11037 width: Py_ssize_t
11038 fillchar: Py_UCS4 = ' '
11039 /
11040
11041Return a centered string of length width.
11042
11043Padding is done using the specified fill character (default is a space).
11044[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045
11046static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011047unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11048/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011050 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051
Benjamin Petersonbac79492012-01-14 13:34:47 -050011052 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053 return NULL;
11054
Victor Stinnerc4b49542011-12-11 22:44:26 +010011055 if (PyUnicode_GET_LENGTH(self) >= width)
11056 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057
Victor Stinnerc4b49542011-12-11 22:44:26 +010011058 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059 left = marg / 2 + (marg & width & 1);
11060
Victor Stinner9310abb2011-10-05 00:59:23 +020011061 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062}
11063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064/* This function assumes that str1 and str2 are readied by the caller. */
11065
Marc-André Lemburge5034372000-08-08 08:04:29 +000011066static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011067unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011068{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011069#define COMPARE(TYPE1, TYPE2) \
11070 do { \
11071 TYPE1* p1 = (TYPE1 *)data1; \
11072 TYPE2* p2 = (TYPE2 *)data2; \
11073 TYPE1* end = p1 + len; \
11074 Py_UCS4 c1, c2; \
11075 for (; p1 != end; p1++, p2++) { \
11076 c1 = *p1; \
11077 c2 = *p2; \
11078 if (c1 != c2) \
11079 return (c1 < c2) ? -1 : 1; \
11080 } \
11081 } \
11082 while (0)
11083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011085 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011086 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 kind1 = PyUnicode_KIND(str1);
11089 kind2 = PyUnicode_KIND(str2);
11090 data1 = PyUnicode_DATA(str1);
11091 data2 = PyUnicode_DATA(str2);
11092 len1 = PyUnicode_GET_LENGTH(str1);
11093 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011094 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011095
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011096 switch(kind1) {
11097 case PyUnicode_1BYTE_KIND:
11098 {
11099 switch(kind2) {
11100 case PyUnicode_1BYTE_KIND:
11101 {
11102 int cmp = memcmp(data1, data2, len);
11103 /* normalize result of memcmp() into the range [-1; 1] */
11104 if (cmp < 0)
11105 return -1;
11106 if (cmp > 0)
11107 return 1;
11108 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011109 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011110 case PyUnicode_2BYTE_KIND:
11111 COMPARE(Py_UCS1, Py_UCS2);
11112 break;
11113 case PyUnicode_4BYTE_KIND:
11114 COMPARE(Py_UCS1, Py_UCS4);
11115 break;
11116 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011117 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011118 }
11119 break;
11120 }
11121 case PyUnicode_2BYTE_KIND:
11122 {
11123 switch(kind2) {
11124 case PyUnicode_1BYTE_KIND:
11125 COMPARE(Py_UCS2, Py_UCS1);
11126 break;
11127 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011128 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011129 COMPARE(Py_UCS2, Py_UCS2);
11130 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011131 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011132 case PyUnicode_4BYTE_KIND:
11133 COMPARE(Py_UCS2, Py_UCS4);
11134 break;
11135 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011136 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011137 }
11138 break;
11139 }
11140 case PyUnicode_4BYTE_KIND:
11141 {
11142 switch(kind2) {
11143 case PyUnicode_1BYTE_KIND:
11144 COMPARE(Py_UCS4, Py_UCS1);
11145 break;
11146 case PyUnicode_2BYTE_KIND:
11147 COMPARE(Py_UCS4, Py_UCS2);
11148 break;
11149 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011150 {
11151#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11152 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11153 /* normalize result of wmemcmp() into the range [-1; 1] */
11154 if (cmp < 0)
11155 return -1;
11156 if (cmp > 0)
11157 return 1;
11158#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011159 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011160#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011161 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011162 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011163 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011164 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011165 }
11166 break;
11167 }
11168 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011169 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011170 }
11171
Victor Stinner770e19e2012-10-04 22:59:45 +020011172 if (len1 == len2)
11173 return 0;
11174 if (len1 < len2)
11175 return -1;
11176 else
11177 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011178
11179#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011180}
11181
Benjamin Peterson621b4302016-09-09 13:54:34 -070011182static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011183unicode_compare_eq(PyObject *str1, PyObject *str2)
11184{
11185 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011186 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011187 Py_ssize_t len;
11188 int cmp;
11189
Victor Stinnere5567ad2012-10-23 02:48:49 +020011190 len = PyUnicode_GET_LENGTH(str1);
11191 if (PyUnicode_GET_LENGTH(str2) != len)
11192 return 0;
11193 kind = PyUnicode_KIND(str1);
11194 if (PyUnicode_KIND(str2) != kind)
11195 return 0;
11196 data1 = PyUnicode_DATA(str1);
11197 data2 = PyUnicode_DATA(str2);
11198
11199 cmp = memcmp(data1, data2, len * kind);
11200 return (cmp == 0);
11201}
11202
11203
Alexander Belopolsky40018472011-02-26 01:02:56 +000011204int
11205PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11208 if (PyUnicode_READY(left) == -1 ||
11209 PyUnicode_READY(right) == -1)
11210 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011211
11212 /* a string is equal to itself */
11213 if (left == right)
11214 return 0;
11215
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011216 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011218 PyErr_Format(PyExc_TypeError,
11219 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011220 Py_TYPE(left)->tp_name,
11221 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222 return -1;
11223}
11224
Martin v. Löwis5b222132007-06-10 09:51:05 +000011225int
11226PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11227{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 Py_ssize_t i;
11229 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011231 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232
Victor Stinner910337b2011-10-03 03:20:16 +020011233 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011234 if (!PyUnicode_IS_READY(uni)) {
11235 const wchar_t *ws = _PyUnicode_WSTR(uni);
11236 /* Compare Unicode string and source character set string */
11237 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11238 if (chr != ustr[i])
11239 return (chr < ustr[i]) ? -1 : 1;
11240 }
11241 /* This check keeps Python strings that end in '\0' from comparing equal
11242 to C strings identical up to that point. */
11243 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11244 return 1; /* uni is longer */
11245 if (ustr[i])
11246 return -1; /* str is longer */
11247 return 0;
11248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011250 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011251 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011252 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011253 size_t len, len2 = strlen(str);
11254 int cmp;
11255
11256 len = Py_MIN(len1, len2);
11257 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011258 if (cmp != 0) {
11259 if (cmp < 0)
11260 return -1;
11261 else
11262 return 1;
11263 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011264 if (len1 > len2)
11265 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011266 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011267 return -1; /* str is longer */
11268 return 0;
11269 }
11270 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011271 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011272 /* Compare Unicode string and source character set string */
11273 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011274 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011275 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11276 /* This check keeps Python strings that end in '\0' from comparing equal
11277 to C strings identical up to that point. */
11278 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11279 return 1; /* uni is longer */
11280 if (str[i])
11281 return -1; /* str is longer */
11282 return 0;
11283 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011284}
11285
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011286static int
11287non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11288{
11289 size_t i, len;
11290 const wchar_t *p;
11291 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11292 if (strlen(str) != len)
11293 return 0;
11294 p = _PyUnicode_WSTR(unicode);
11295 assert(p);
11296 for (i = 0; i < len; i++) {
11297 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011298 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011299 return 0;
11300 }
11301 return 1;
11302}
11303
11304int
11305_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11306{
11307 size_t len;
11308 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011309 assert(str);
11310#ifndef NDEBUG
11311 for (const char *p = str; *p; p++) {
11312 assert((unsigned char)*p < 128);
11313 }
11314#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011315 if (PyUnicode_READY(unicode) == -1) {
11316 /* Memory error or bad data */
11317 PyErr_Clear();
11318 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11319 }
11320 if (!PyUnicode_IS_ASCII(unicode))
11321 return 0;
11322 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11323 return strlen(str) == len &&
11324 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11325}
11326
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011327int
11328_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11329{
11330 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011331
11332 assert(_PyUnicode_CHECK(left));
11333 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011334#ifndef NDEBUG
11335 for (const char *p = right->string; *p; p++) {
11336 assert((unsigned char)*p < 128);
11337 }
11338#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011339
11340 if (PyUnicode_READY(left) == -1) {
11341 /* memory error or bad data */
11342 PyErr_Clear();
11343 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11344 }
11345
11346 if (!PyUnicode_IS_ASCII(left))
11347 return 0;
11348
11349 right_uni = _PyUnicode_FromId(right); /* borrowed */
11350 if (right_uni == NULL) {
11351 /* memory error or bad data */
11352 PyErr_Clear();
11353 return _PyUnicode_EqualToASCIIString(left, right->string);
11354 }
11355
11356 if (left == right_uni)
11357 return 1;
11358
11359 if (PyUnicode_CHECK_INTERNED(left))
11360 return 0;
11361
Victor Stinner607b1022020-05-05 18:50:30 +020011362#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011363 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011364 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011365 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11366 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011367#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011368
11369 return unicode_compare_eq(left, right_uni);
11370}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011371
Alexander Belopolsky40018472011-02-26 01:02:56 +000011372PyObject *
11373PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011374{
11375 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011376
Victor Stinnere5567ad2012-10-23 02:48:49 +020011377 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11378 Py_RETURN_NOTIMPLEMENTED;
11379
11380 if (PyUnicode_READY(left) == -1 ||
11381 PyUnicode_READY(right) == -1)
11382 return NULL;
11383
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011384 if (left == right) {
11385 switch (op) {
11386 case Py_EQ:
11387 case Py_LE:
11388 case Py_GE:
11389 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011390 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011391 case Py_NE:
11392 case Py_LT:
11393 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011394 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011395 default:
11396 PyErr_BadArgument();
11397 return NULL;
11398 }
11399 }
11400 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011401 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011402 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011403 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011404 }
11405 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011406 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011407 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011408 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011409}
11410
Alexander Belopolsky40018472011-02-26 01:02:56 +000011411int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011412_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11413{
11414 return unicode_eq(aa, bb);
11415}
11416
11417int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011418PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011419{
Victor Stinner77282cb2013-04-14 19:22:47 +020011420 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011421 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011423 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011424
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011425 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011426 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011427 "'in <string>' requires string as left operand, not %.100s",
11428 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011429 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011430 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011431 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011432 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011433 if (ensure_unicode(str) < 0)
11434 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011437 kind2 = PyUnicode_KIND(substr);
11438 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011439 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011441 len2 = PyUnicode_GET_LENGTH(substr);
11442 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011443 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011444 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011445 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011446 if (len2 == 1) {
11447 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11448 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011449 return result;
11450 }
11451 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011452 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011453 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011454 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011455 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456
Victor Stinner77282cb2013-04-14 19:22:47 +020011457 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 case PyUnicode_1BYTE_KIND:
11459 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11460 break;
11461 case PyUnicode_2BYTE_KIND:
11462 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11463 break;
11464 case PyUnicode_4BYTE_KIND:
11465 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11466 break;
11467 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011468 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011470
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011471 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011472 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011473 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474
Guido van Rossum403d68b2000-03-13 15:55:09 +000011475 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011476}
11477
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478/* Concat to string or Unicode object giving a new Unicode object. */
11479
Alexander Belopolsky40018472011-02-26 01:02:56 +000011480PyObject *
11481PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011483 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011484 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011485 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011487 if (ensure_unicode(left) < 0)
11488 return NULL;
11489
11490 if (!PyUnicode_Check(right)) {
11491 PyErr_Format(PyExc_TypeError,
11492 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011493 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011494 return NULL;
11495 }
11496 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
11499 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011500 if (left == unicode_empty)
11501 return PyUnicode_FromObject(right);
11502 if (right == unicode_empty)
11503 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011505 left_len = PyUnicode_GET_LENGTH(left);
11506 right_len = PyUnicode_GET_LENGTH(right);
11507 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011508 PyErr_SetString(PyExc_OverflowError,
11509 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011510 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011511 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011512 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011513
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011514 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11515 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011516 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011519 result = PyUnicode_New(new_len, maxchar);
11520 if (result == NULL)
11521 return NULL;
11522 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11523 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11524 assert(_PyUnicode_CheckConsistency(result, 1));
11525 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526}
11527
Walter Dörwald1ab83302007-05-18 17:15:44 +000011528void
Victor Stinner23e56682011-10-03 03:54:37 +020011529PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011530{
Victor Stinner23e56682011-10-03 03:54:37 +020011531 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011532 Py_UCS4 maxchar, maxchar2;
11533 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011534
11535 if (p_left == NULL) {
11536 if (!PyErr_Occurred())
11537 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011538 return;
11539 }
Victor Stinner23e56682011-10-03 03:54:37 +020011540 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011541 if (right == NULL || left == NULL
11542 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011543 if (!PyErr_Occurred())
11544 PyErr_BadInternalCall();
11545 goto error;
11546 }
11547
Benjamin Petersonbac79492012-01-14 13:34:47 -050011548 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011549 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011550 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011551 goto error;
11552
Victor Stinner488fa492011-12-12 00:01:39 +010011553 /* Shortcuts */
11554 if (left == unicode_empty) {
11555 Py_DECREF(left);
11556 Py_INCREF(right);
11557 *p_left = right;
11558 return;
11559 }
11560 if (right == unicode_empty)
11561 return;
11562
11563 left_len = PyUnicode_GET_LENGTH(left);
11564 right_len = PyUnicode_GET_LENGTH(right);
11565 if (left_len > PY_SSIZE_T_MAX - right_len) {
11566 PyErr_SetString(PyExc_OverflowError,
11567 "strings are too large to concat");
11568 goto error;
11569 }
11570 new_len = left_len + right_len;
11571
11572 if (unicode_modifiable(left)
11573 && PyUnicode_CheckExact(right)
11574 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011575 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11576 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011577 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011578 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011579 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11580 {
11581 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011582 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011583 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011584
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011585 /* copy 'right' into the newly allocated area of 'left' */
11586 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011587 }
Victor Stinner488fa492011-12-12 00:01:39 +010011588 else {
11589 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11590 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011591 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011592
Victor Stinner488fa492011-12-12 00:01:39 +010011593 /* Concat the two Unicode strings */
11594 res = PyUnicode_New(new_len, maxchar);
11595 if (res == NULL)
11596 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011597 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11598 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011599 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011600 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011601 }
11602 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011603 return;
11604
11605error:
Victor Stinner488fa492011-12-12 00:01:39 +010011606 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011607}
11608
11609void
11610PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11611{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011612 PyUnicode_Append(pleft, right);
11613 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011614}
11615
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011616/*
11617Wraps stringlib_parse_args_finds() and additionally ensures that the
11618first argument is a unicode object.
11619*/
11620
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011621static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011622parse_args_finds_unicode(const char * function_name, PyObject *args,
11623 PyObject **substring,
11624 Py_ssize_t *start, Py_ssize_t *end)
11625{
11626 if(stringlib_parse_args_finds(function_name, args, substring,
11627 start, end)) {
11628 if (ensure_unicode(*substring) < 0)
11629 return 0;
11630 return 1;
11631 }
11632 return 0;
11633}
11634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011635PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011638Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011639string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011640interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641
11642static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011643unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011645 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011646 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011647 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011649 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011650 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011653 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011654 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 kind1 = PyUnicode_KIND(self);
11657 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011658 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011659 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 len1 = PyUnicode_GET_LENGTH(self);
11662 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011664 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011665 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011666
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011667 buf1 = PyUnicode_DATA(self);
11668 buf2 = PyUnicode_DATA(substring);
11669 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011670 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011671 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011672 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011673 }
11674 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 case PyUnicode_1BYTE_KIND:
11676 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011677 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 buf2, len2, PY_SSIZE_T_MAX
11679 );
11680 break;
11681 case PyUnicode_2BYTE_KIND:
11682 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011683 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 buf2, len2, PY_SSIZE_T_MAX
11685 );
11686 break;
11687 case PyUnicode_4BYTE_KIND:
11688 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011689 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 buf2, len2, PY_SSIZE_T_MAX
11691 );
11692 break;
11693 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011694 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 }
11696
11697 result = PyLong_FromSsize_t(iresult);
11698
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011699 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011700 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011701 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703 return result;
11704}
11705
INADA Naoki3ae20562017-01-16 20:41:20 +090011706/*[clinic input]
11707str.encode as unicode_encode
11708
11709 encoding: str(c_default="NULL") = 'utf-8'
11710 The encoding in which to encode the string.
11711 errors: str(c_default="NULL") = 'strict'
11712 The error handling scheme to use for encoding errors.
11713 The default is 'strict' meaning that encoding errors raise a
11714 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11715 'xmlcharrefreplace' as well as any other name registered with
11716 codecs.register_error that can handle UnicodeEncodeErrors.
11717
11718Encode the string using the codec registered for encoding.
11719[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720
11721static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011722unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011723/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011725 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011726}
11727
INADA Naoki3ae20562017-01-16 20:41:20 +090011728/*[clinic input]
11729str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730
INADA Naoki3ae20562017-01-16 20:41:20 +090011731 tabsize: int = 8
11732
11733Return a copy where all tab characters are expanded using spaces.
11734
11735If tabsize is not given, a tab size of 8 characters is assumed.
11736[clinic start generated code]*/
11737
11738static PyObject *
11739unicode_expandtabs_impl(PyObject *self, int tabsize)
11740/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011742 Py_ssize_t i, j, line_pos, src_len, incr;
11743 Py_UCS4 ch;
11744 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011745 const void *src_data;
11746 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011747 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011748 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749
Antoine Pitrou22425222011-10-04 19:10:51 +020011750 if (PyUnicode_READY(self) == -1)
11751 return NULL;
11752
Thomas Wouters7e474022000-07-16 12:04:32 +000011753 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011754 src_len = PyUnicode_GET_LENGTH(self);
11755 i = j = line_pos = 0;
11756 kind = PyUnicode_KIND(self);
11757 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011758 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011759 for (; i < src_len; i++) {
11760 ch = PyUnicode_READ(kind, src_data, i);
11761 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011762 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011764 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011765 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011766 goto overflow;
11767 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011769 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011773 goto overflow;
11774 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011776 if (ch == '\n' || ch == '\r')
11777 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011779 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011780 if (!found)
11781 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011782
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011784 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785 if (!u)
11786 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011787 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788
Antoine Pitroue71d5742011-10-04 15:55:09 +020011789 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790
Antoine Pitroue71d5742011-10-04 15:55:09 +020011791 for (; i < src_len; i++) {
11792 ch = PyUnicode_READ(kind, src_data, i);
11793 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011794 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011795 incr = tabsize - (line_pos % tabsize);
11796 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011797 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011798 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011799 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011800 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011801 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011802 line_pos++;
11803 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011804 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011805 if (ch == '\n' || ch == '\r')
11806 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011808 }
11809 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011810 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011811
Antoine Pitroue71d5742011-10-04 15:55:09 +020011812 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011813 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11814 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815}
11816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011817PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011818 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819\n\
11820Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011821such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822arguments start and end are interpreted as in slice notation.\n\
11823\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011824Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825
11826static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011829 /* initialize variables to prevent gcc warning */
11830 PyObject *substring = NULL;
11831 Py_ssize_t start = 0;
11832 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011833 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011835 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011838 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011839 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011841 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 if (result == -2)
11844 return NULL;
11845
Christian Heimes217cfd12007-12-02 14:31:20 +000011846 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847}
11848
11849static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011850unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011852 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011853 enum PyUnicode_Kind kind;
11854 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011855
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011856 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011857 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011859 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011860 if (PyUnicode_READY(self) == -1) {
11861 return NULL;
11862 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011863 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11864 PyErr_SetString(PyExc_IndexError, "string index out of range");
11865 return NULL;
11866 }
11867 kind = PyUnicode_KIND(self);
11868 data = PyUnicode_DATA(self);
11869 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011870 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871}
11872
Guido van Rossumc2504932007-09-18 19:42:40 +000011873/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011874 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011875static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011876unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011878 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011879
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011880#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011881 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011882#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 if (_PyUnicode_HASH(self) != -1)
11884 return _PyUnicode_HASH(self);
11885 if (PyUnicode_READY(self) == -1)
11886 return -1;
animalizea1d14252019-01-02 20:16:06 +080011887
Christian Heimes985ecdc2013-11-20 11:46:18 +010011888 x = _Py_HashBytes(PyUnicode_DATA(self),
11889 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011891 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892}
11893
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011894PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011895 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896\n\
oldkaa0735f2018-02-02 16:52:55 +080011897Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011898such that sub is contained within S[start:end]. Optional\n\
11899arguments start and end are interpreted as in slice notation.\n\
11900\n\
11901Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902
11903static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011906 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011907 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011908 PyObject *substring = NULL;
11909 Py_ssize_t start = 0;
11910 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011912 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011915 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011918 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 if (result == -2)
11921 return NULL;
11922
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 if (result < 0) {
11924 PyErr_SetString(PyExc_ValueError, "substring not found");
11925 return NULL;
11926 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011927
Christian Heimes217cfd12007-12-02 14:31:20 +000011928 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929}
11930
INADA Naoki3ae20562017-01-16 20:41:20 +090011931/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011932str.isascii as unicode_isascii
11933
11934Return True if all characters in the string are ASCII, False otherwise.
11935
11936ASCII characters have code points in the range U+0000-U+007F.
11937Empty string is ASCII too.
11938[clinic start generated code]*/
11939
11940static PyObject *
11941unicode_isascii_impl(PyObject *self)
11942/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11943{
11944 if (PyUnicode_READY(self) == -1) {
11945 return NULL;
11946 }
11947 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11948}
11949
11950/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011951str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952
INADA Naoki3ae20562017-01-16 20:41:20 +090011953Return True if the string is a lowercase string, False otherwise.
11954
11955A string is lowercase if all cased characters in the string are lowercase and
11956there is at least one cased character in the string.
11957[clinic start generated code]*/
11958
11959static PyObject *
11960unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011961/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 Py_ssize_t i, length;
11964 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011965 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 int cased;
11967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 if (PyUnicode_READY(self) == -1)
11969 return NULL;
11970 length = PyUnicode_GET_LENGTH(self);
11971 kind = PyUnicode_KIND(self);
11972 data = PyUnicode_DATA(self);
11973
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 if (length == 1)
11976 return PyBool_FromLong(
11977 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011979 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011981 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011982
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 for (i = 0; i < length; i++) {
11985 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011986
Benjamin Peterson29060642009-01-31 22:14:21 +000011987 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011988 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 else if (!cased && Py_UNICODE_ISLOWER(ch))
11990 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011992 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993}
11994
INADA Naoki3ae20562017-01-16 20:41:20 +090011995/*[clinic input]
11996str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997
INADA Naoki3ae20562017-01-16 20:41:20 +090011998Return True if the string is an uppercase string, False otherwise.
11999
12000A string is uppercase if all cased characters in the string are uppercase and
12001there is at least one cased character in the string.
12002[clinic start generated code]*/
12003
12004static PyObject *
12005unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012006/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 Py_ssize_t i, length;
12009 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012010 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011 int cased;
12012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 if (PyUnicode_READY(self) == -1)
12014 return NULL;
12015 length = PyUnicode_GET_LENGTH(self);
12016 kind = PyUnicode_KIND(self);
12017 data = PyUnicode_DATA(self);
12018
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 if (length == 1)
12021 return PyBool_FromLong(
12022 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012024 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012026 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012027
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029 for (i = 0; i < length; i++) {
12030 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012031
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012033 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012034 else if (!cased && Py_UNICODE_ISUPPER(ch))
12035 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012037 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038}
12039
INADA Naoki3ae20562017-01-16 20:41:20 +090012040/*[clinic input]
12041str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042
INADA Naoki3ae20562017-01-16 20:41:20 +090012043Return True if the string is a title-cased string, False otherwise.
12044
12045In a title-cased string, upper- and title-case characters may only
12046follow uncased characters and lowercase characters only cased ones.
12047[clinic start generated code]*/
12048
12049static PyObject *
12050unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012051/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 Py_ssize_t i, length;
12054 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012055 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056 int cased, previous_is_cased;
12057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 if (PyUnicode_READY(self) == -1)
12059 return NULL;
12060 length = PyUnicode_GET_LENGTH(self);
12061 kind = PyUnicode_KIND(self);
12062 data = PyUnicode_DATA(self);
12063
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012065 if (length == 1) {
12066 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12067 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12068 (Py_UNICODE_ISUPPER(ch) != 0));
12069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012071 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012073 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012074
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075 cased = 0;
12076 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 for (i = 0; i < length; i++) {
12078 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012079
Benjamin Peterson29060642009-01-31 22:14:21 +000012080 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12081 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012082 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012083 previous_is_cased = 1;
12084 cased = 1;
12085 }
12086 else if (Py_UNICODE_ISLOWER(ch)) {
12087 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012088 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012089 previous_is_cased = 1;
12090 cased = 1;
12091 }
12092 else
12093 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012095 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096}
12097
INADA Naoki3ae20562017-01-16 20:41:20 +090012098/*[clinic input]
12099str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100
INADA Naoki3ae20562017-01-16 20:41:20 +090012101Return True if the string is a whitespace string, False otherwise.
12102
12103A string is whitespace if all characters in the string are whitespace and there
12104is at least one character in the string.
12105[clinic start generated code]*/
12106
12107static PyObject *
12108unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012109/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 Py_ssize_t i, length;
12112 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012113 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114
12115 if (PyUnicode_READY(self) == -1)
12116 return NULL;
12117 length = PyUnicode_GET_LENGTH(self);
12118 kind = PyUnicode_KIND(self);
12119 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 if (length == 1)
12123 return PyBool_FromLong(
12124 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012126 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012128 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 for (i = 0; i < length; i++) {
12131 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012132 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012133 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012135 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136}
12137
INADA Naoki3ae20562017-01-16 20:41:20 +090012138/*[clinic input]
12139str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012140
INADA Naoki3ae20562017-01-16 20:41:20 +090012141Return True if the string is an alphabetic string, False otherwise.
12142
12143A string is alphabetic if all characters in the string are alphabetic and there
12144is at least one character in the string.
12145[clinic start generated code]*/
12146
12147static PyObject *
12148unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012149/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012150{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 Py_ssize_t i, length;
12152 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012153 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154
12155 if (PyUnicode_READY(self) == -1)
12156 return NULL;
12157 length = PyUnicode_GET_LENGTH(self);
12158 kind = PyUnicode_KIND(self);
12159 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012160
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012161 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 if (length == 1)
12163 return PyBool_FromLong(
12164 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012165
12166 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012168 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 for (i = 0; i < length; i++) {
12171 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012172 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012173 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012174 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012175}
12176
INADA Naoki3ae20562017-01-16 20:41:20 +090012177/*[clinic input]
12178str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012179
INADA Naoki3ae20562017-01-16 20:41:20 +090012180Return True if the string is an alpha-numeric string, False otherwise.
12181
12182A string is alpha-numeric if all characters in the string are alpha-numeric and
12183there is at least one character in the string.
12184[clinic start generated code]*/
12185
12186static PyObject *
12187unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012188/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012189{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012191 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 Py_ssize_t len, i;
12193
12194 if (PyUnicode_READY(self) == -1)
12195 return NULL;
12196
12197 kind = PyUnicode_KIND(self);
12198 data = PyUnicode_DATA(self);
12199 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012200
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012201 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 if (len == 1) {
12203 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12204 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12205 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012206
12207 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012209 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 for (i = 0; i < len; i++) {
12212 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012213 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012214 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012215 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012216 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012217}
12218
INADA Naoki3ae20562017-01-16 20:41:20 +090012219/*[clinic input]
12220str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221
INADA Naoki3ae20562017-01-16 20:41:20 +090012222Return True if the string is a decimal string, False otherwise.
12223
12224A string is a decimal string if all characters in the string are decimal and
12225there is at least one character in the string.
12226[clinic start generated code]*/
12227
12228static PyObject *
12229unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012230/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232 Py_ssize_t i, length;
12233 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012234 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235
12236 if (PyUnicode_READY(self) == -1)
12237 return NULL;
12238 length = PyUnicode_GET_LENGTH(self);
12239 kind = PyUnicode_KIND(self);
12240 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 if (length == 1)
12244 return PyBool_FromLong(
12245 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012247 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012249 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251 for (i = 0; i < length; i++) {
12252 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012253 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012255 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256}
12257
INADA Naoki3ae20562017-01-16 20:41:20 +090012258/*[clinic input]
12259str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260
INADA Naoki3ae20562017-01-16 20:41:20 +090012261Return True if the string is a digit string, False otherwise.
12262
12263A string is a digit string if all characters in the string are digits and there
12264is at least one character in the string.
12265[clinic start generated code]*/
12266
12267static PyObject *
12268unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012269/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 Py_ssize_t i, length;
12272 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012273 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274
12275 if (PyUnicode_READY(self) == -1)
12276 return NULL;
12277 length = PyUnicode_GET_LENGTH(self);
12278 kind = PyUnicode_KIND(self);
12279 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 if (length == 1) {
12283 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12284 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012287 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012289 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 for (i = 0; i < length; i++) {
12292 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012293 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012295 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296}
12297
INADA Naoki3ae20562017-01-16 20:41:20 +090012298/*[clinic input]
12299str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300
INADA Naoki3ae20562017-01-16 20:41:20 +090012301Return True if the string is a numeric string, False otherwise.
12302
12303A string is numeric if all characters in the string are numeric and there is at
12304least one character in the string.
12305[clinic start generated code]*/
12306
12307static PyObject *
12308unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012309/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 Py_ssize_t i, length;
12312 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012313 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314
12315 if (PyUnicode_READY(self) == -1)
12316 return NULL;
12317 length = PyUnicode_GET_LENGTH(self);
12318 kind = PyUnicode_KIND(self);
12319 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 if (length == 1)
12323 return PyBool_FromLong(
12324 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012326 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012328 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 for (i = 0; i < length; i++) {
12331 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012332 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012334 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335}
12336
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012337Py_ssize_t
12338_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012339{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012341 if (PyUnicode_READY(self) == -1)
12342 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012343
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012344 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012345 if (len == 0) {
12346 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012347 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 }
12349
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012350 int kind = PyUnicode_KIND(self);
12351 const void *data = PyUnicode_DATA(self);
12352 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012353 /* PEP 3131 says that the first character must be in
12354 XID_Start and subsequent characters in XID_Continue,
12355 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012356 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012357 letters, digits, underscore). However, given the current
12358 definition of XID_Start and XID_Continue, it is sufficient
12359 to check just for these, except that _ must be allowed
12360 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012361 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012362 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012363 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012364
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012365 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012366 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012367 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012368 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012369 }
12370 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012371 return i;
12372}
12373
12374int
12375PyUnicode_IsIdentifier(PyObject *self)
12376{
12377 if (PyUnicode_IS_READY(self)) {
12378 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12379 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12380 /* an empty string is not a valid identifier */
12381 return len && i == len;
12382 }
12383 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012384_Py_COMP_DIAG_PUSH
12385_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012386 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012387 if (len == 0) {
12388 /* an empty string is not a valid identifier */
12389 return 0;
12390 }
12391
12392 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012393 Py_UCS4 ch = wstr[i++];
12394#if SIZEOF_WCHAR_T == 2
12395 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12396 && i < len
12397 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12398 {
12399 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12400 i++;
12401 }
12402#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012403 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12404 return 0;
12405 }
12406
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012407 while (i < len) {
12408 ch = wstr[i++];
12409#if SIZEOF_WCHAR_T == 2
12410 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12411 && i < len
12412 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12413 {
12414 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12415 i++;
12416 }
12417#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012418 if (!_PyUnicode_IsXidContinue(ch)) {
12419 return 0;
12420 }
12421 }
12422 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012423_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012424 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012425}
12426
INADA Naoki3ae20562017-01-16 20:41:20 +090012427/*[clinic input]
12428str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012429
INADA Naoki3ae20562017-01-16 20:41:20 +090012430Return True if the string is a valid Python identifier, False otherwise.
12431
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012432Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012433such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012434[clinic start generated code]*/
12435
12436static PyObject *
12437unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012438/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012439{
12440 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12441}
12442
INADA Naoki3ae20562017-01-16 20:41:20 +090012443/*[clinic input]
12444str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012445
INADA Naoki3ae20562017-01-16 20:41:20 +090012446Return True if the string is printable, False otherwise.
12447
12448A string is printable if all of its characters are considered printable in
12449repr() or if it is empty.
12450[clinic start generated code]*/
12451
12452static PyObject *
12453unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012454/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012455{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 Py_ssize_t i, length;
12457 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012458 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459
12460 if (PyUnicode_READY(self) == -1)
12461 return NULL;
12462 length = PyUnicode_GET_LENGTH(self);
12463 kind = PyUnicode_KIND(self);
12464 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012465
12466 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 if (length == 1)
12468 return PyBool_FromLong(
12469 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 for (i = 0; i < length; i++) {
12472 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012473 Py_RETURN_FALSE;
12474 }
12475 }
12476 Py_RETURN_TRUE;
12477}
12478
INADA Naoki3ae20562017-01-16 20:41:20 +090012479/*[clinic input]
12480str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481
INADA Naoki3ae20562017-01-16 20:41:20 +090012482 iterable: object
12483 /
12484
12485Concatenate any number of strings.
12486
Martin Panter91a88662017-01-24 00:30:06 +000012487The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012488The result is returned as a new string.
12489
12490Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12491[clinic start generated code]*/
12492
12493static PyObject *
12494unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012495/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496{
INADA Naoki3ae20562017-01-16 20:41:20 +090012497 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498}
12499
Martin v. Löwis18e16552006-02-15 17:27:45 +000012500static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012501unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012503 if (PyUnicode_READY(self) == -1)
12504 return -1;
12505 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506}
12507
INADA Naoki3ae20562017-01-16 20:41:20 +090012508/*[clinic input]
12509str.ljust as unicode_ljust
12510
12511 width: Py_ssize_t
12512 fillchar: Py_UCS4 = ' '
12513 /
12514
12515Return a left-justified string of length width.
12516
12517Padding is done using the specified fill character (default is a space).
12518[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519
12520static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012521unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12522/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012524 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012525 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526
Victor Stinnerc4b49542011-12-11 22:44:26 +010012527 if (PyUnicode_GET_LENGTH(self) >= width)
12528 return unicode_result_unchanged(self);
12529
12530 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531}
12532
INADA Naoki3ae20562017-01-16 20:41:20 +090012533/*[clinic input]
12534str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535
INADA Naoki3ae20562017-01-16 20:41:20 +090012536Return a copy of the string converted to lowercase.
12537[clinic start generated code]*/
12538
12539static PyObject *
12540unicode_lower_impl(PyObject *self)
12541/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012543 if (PyUnicode_READY(self) == -1)
12544 return NULL;
12545 if (PyUnicode_IS_ASCII(self))
12546 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012547 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548}
12549
/* Strip modes understood by the strip helpers.  The values double as
   indexes into stripfuncnames[] below, so the two must stay in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by strip mode, used when building error messages.
   Both the strings and the pointer array are const so that the table is
   read-only. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip mode `i`. */
#define STRIPNAME(i) (stripfuncnames[(i)])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012558
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012559/* externally visible for str.strip(unicode) */
12560PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012561_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012562{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012563 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 int kind;
12565 Py_ssize_t i, j, len;
12566 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012567 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12570 return NULL;
12571
12572 kind = PyUnicode_KIND(self);
12573 data = PyUnicode_DATA(self);
12574 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012575 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12577 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012578 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012579
Benjamin Peterson14339b62009-01-31 16:36:08 +000012580 i = 0;
12581 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012582 while (i < len) {
12583 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12584 if (!BLOOM(sepmask, ch))
12585 break;
12586 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12587 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012588 i++;
12589 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012590 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012591
Benjamin Peterson14339b62009-01-31 16:36:08 +000012592 j = len;
12593 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012594 j--;
12595 while (j >= i) {
12596 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12597 if (!BLOOM(sepmask, ch))
12598 break;
12599 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12600 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012601 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012602 }
12603
Benjamin Peterson29060642009-01-31 22:14:21 +000012604 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012605 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012606
Victor Stinner7931d9a2011-11-04 00:22:48 +010012607 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608}
12609
12610PyObject*
12611PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12612{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012613 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012614 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012615 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616
Victor Stinnerde636f32011-10-01 03:55:54 +020012617 if (PyUnicode_READY(self) == -1)
12618 return NULL;
12619
Victor Stinner684d5fd2012-05-03 02:32:34 +020012620 length = PyUnicode_GET_LENGTH(self);
12621 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012622
Victor Stinner684d5fd2012-05-03 02:32:34 +020012623 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012624 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625
Victor Stinnerde636f32011-10-01 03:55:54 +020012626 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012627 PyErr_SetString(PyExc_IndexError, "string index out of range");
12628 return NULL;
12629 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012630 if (start >= length || end < start)
12631 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012632
Victor Stinner684d5fd2012-05-03 02:32:34 +020012633 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012634 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012635 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012636 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012637 }
12638 else {
12639 kind = PyUnicode_KIND(self);
12640 data = PyUnicode_1BYTE_DATA(self);
12641 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012642 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012643 length);
12644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646
12647static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012648do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 Py_ssize_t len, i, j;
12651
12652 if (PyUnicode_READY(self) == -1)
12653 return NULL;
12654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012656
Victor Stinnercc7af722013-04-09 22:39:24 +020012657 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012658 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012659
12660 i = 0;
12661 if (striptype != RIGHTSTRIP) {
12662 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012663 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012664 if (!_Py_ascii_whitespace[ch])
12665 break;
12666 i++;
12667 }
12668 }
12669
12670 j = len;
12671 if (striptype != LEFTSTRIP) {
12672 j--;
12673 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012674 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012675 if (!_Py_ascii_whitespace[ch])
12676 break;
12677 j--;
12678 }
12679 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012680 }
12681 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012682 else {
12683 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012684 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012685
Victor Stinnercc7af722013-04-09 22:39:24 +020012686 i = 0;
12687 if (striptype != RIGHTSTRIP) {
12688 while (i < len) {
12689 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12690 if (!Py_UNICODE_ISSPACE(ch))
12691 break;
12692 i++;
12693 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012694 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012695
12696 j = len;
12697 if (striptype != LEFTSTRIP) {
12698 j--;
12699 while (j >= i) {
12700 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12701 if (!Py_UNICODE_ISSPACE(ch))
12702 break;
12703 j--;
12704 }
12705 j++;
12706 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012707 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012708
Victor Stinner7931d9a2011-11-04 00:22:48 +010012709 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012710}
12711
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012712
12713static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012714do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012715{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012716 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012717 if (PyUnicode_Check(sep))
12718 return _PyUnicode_XStrip(self, striptype, sep);
12719 else {
12720 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012721 "%s arg must be None or str",
12722 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012723 return NULL;
12724 }
12725 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012726
Benjamin Peterson14339b62009-01-31 16:36:08 +000012727 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012728}
12729
12730
INADA Naoki3ae20562017-01-16 20:41:20 +090012731/*[clinic input]
12732str.strip as unicode_strip
12733
12734 chars: object = None
12735 /
12736
Zachary Ware09895c22019-10-09 16:09:00 -050012737Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012738
12739If chars is given and not None, remove characters in chars instead.
12740[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012741
12742static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012743unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012744/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012745{
INADA Naoki3ae20562017-01-16 20:41:20 +090012746 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012747}
12748
12749
INADA Naoki3ae20562017-01-16 20:41:20 +090012750/*[clinic input]
12751str.lstrip as unicode_lstrip
12752
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012753 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012754 /
12755
12756Return a copy of the string with leading whitespace removed.
12757
12758If chars is given and not None, remove characters in chars instead.
12759[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012760
12761static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012762unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012763/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012764{
INADA Naoki3ae20562017-01-16 20:41:20 +090012765 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012766}
12767
12768
INADA Naoki3ae20562017-01-16 20:41:20 +090012769/*[clinic input]
12770str.rstrip as unicode_rstrip
12771
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012772 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012773 /
12774
12775Return a copy of the string with trailing whitespace removed.
12776
12777If chars is given and not None, remove characters in chars instead.
12778[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012779
12780static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012781unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012782/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012783{
INADA Naoki3ae20562017-01-16 20:41:20 +090012784 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012785}
12786
12787
Guido van Rossumd57fd912000-03-10 22:53:23 +000012788static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012789unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012791 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793
Serhiy Storchaka05997252013-01-26 12:14:02 +020012794 if (len < 1)
12795 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796
Victor Stinnerc4b49542011-12-11 22:44:26 +010012797 /* no repeat, return original string */
12798 if (len == 1)
12799 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012800
Benjamin Petersonbac79492012-01-14 13:34:47 -050012801 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 return NULL;
12803
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012804 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012805 PyErr_SetString(PyExc_OverflowError,
12806 "repeated string is too long");
12807 return NULL;
12808 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012810
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012811 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812 if (!u)
12813 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012814 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012817 int kind = PyUnicode_KIND(str);
12818 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012819 if (kind == PyUnicode_1BYTE_KIND) {
12820 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012821 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012822 }
12823 else if (kind == PyUnicode_2BYTE_KIND) {
12824 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012825 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012826 ucs2[n] = fill_char;
12827 } else {
12828 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12829 assert(kind == PyUnicode_4BYTE_KIND);
12830 for (n = 0; n < len; ++n)
12831 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012832 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012833 }
12834 else {
12835 /* number of characters copied this far */
12836 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012837 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012838 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012839 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012840 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012841 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012842 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012843 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012844 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846 }
12847
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012848 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012849 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850}
12851
Alexander Belopolsky40018472011-02-26 01:02:56 +000012852PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012853PyUnicode_Replace(PyObject *str,
12854 PyObject *substr,
12855 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012856 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012858 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12859 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012860 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012861 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862}
12863
INADA Naoki3ae20562017-01-16 20:41:20 +090012864/*[clinic input]
12865str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866
INADA Naoki3ae20562017-01-16 20:41:20 +090012867 old: unicode
12868 new: unicode
12869 count: Py_ssize_t = -1
12870 Maximum number of occurrences to replace.
12871 -1 (the default value) means replace all occurrences.
12872 /
12873
12874Return a copy with all occurrences of substring old replaced by new.
12875
12876If the optional argument count is given, only the first count occurrences are
12877replaced.
12878[clinic start generated code]*/
12879
12880static PyObject *
12881unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12882 Py_ssize_t count)
12883/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012885 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012886 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012887 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888}
12889
sweeneydea81849b2020-04-22 17:05:48 -040012890/*[clinic input]
12891str.removeprefix as unicode_removeprefix
12892
12893 prefix: unicode
12894 /
12895
12896Return a str with the given prefix string removed if present.
12897
12898If the string starts with the prefix string, return string[len(prefix):].
12899Otherwise, return a copy of the original string.
12900[clinic start generated code]*/
12901
12902static PyObject *
12903unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12904/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12905{
12906 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12907 if (match == -1) {
12908 return NULL;
12909 }
12910 if (match) {
12911 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12912 PyUnicode_GET_LENGTH(self));
12913 }
12914 return unicode_result_unchanged(self);
12915}
12916
12917/*[clinic input]
12918str.removesuffix as unicode_removesuffix
12919
12920 suffix: unicode
12921 /
12922
12923Return a str with the given suffix string removed if present.
12924
12925If the string ends with the suffix string and that suffix is not empty,
12926return string[:-len(suffix)]. Otherwise, return a copy of the original
12927string.
12928[clinic start generated code]*/
12929
12930static PyObject *
12931unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12932/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12933{
12934 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12935 if (match == -1) {
12936 return NULL;
12937 }
12938 if (match) {
12939 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12940 - PyUnicode_GET_LENGTH(suffix));
12941 }
12942 return unicode_result_unchanged(self);
12943}
12944
Alexander Belopolsky40018472011-02-26 01:02:56 +000012945static PyObject *
12946unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012948 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 Py_ssize_t isize;
12950 Py_ssize_t osize, squote, dquote, i, o;
12951 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012952 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012953 const void *idata;
12954 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012957 return NULL;
12958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959 isize = PyUnicode_GET_LENGTH(unicode);
12960 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962 /* Compute length of output, quote characters, and
12963 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012964 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 max = 127;
12966 squote = dquote = 0;
12967 ikind = PyUnicode_KIND(unicode);
12968 for (i = 0; i < isize; i++) {
12969 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012970 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012971 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012972 case '\'': squote++; break;
12973 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012974 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012975 incr = 2;
12976 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977 default:
12978 /* Fast-path ASCII */
12979 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012980 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012982 ;
12983 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012986 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012987 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012988 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012989 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012990 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012992 if (osize > PY_SSIZE_T_MAX - incr) {
12993 PyErr_SetString(PyExc_OverflowError,
12994 "string is too long to generate repr");
12995 return NULL;
12996 }
12997 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 }
12999
13000 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013001 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013002 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013003 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 if (dquote)
13005 /* Both squote and dquote present. Use squote,
13006 and escape them */
13007 osize += squote;
13008 else
13009 quote = '"';
13010 }
Victor Stinner55c08782013-04-14 18:45:39 +020013011 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012
13013 repr = PyUnicode_New(osize, max);
13014 if (repr == NULL)
13015 return NULL;
13016 okind = PyUnicode_KIND(repr);
13017 odata = PyUnicode_DATA(repr);
13018
13019 PyUnicode_WRITE(okind, odata, 0, quote);
13020 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013021 if (unchanged) {
13022 _PyUnicode_FastCopyCharacters(repr, 1,
13023 unicode, 0,
13024 isize);
13025 }
13026 else {
13027 for (i = 0, o = 1; i < isize; i++) {
13028 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013029
Victor Stinner55c08782013-04-14 18:45:39 +020013030 /* Escape quotes and backslashes */
13031 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013032 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013034 continue;
13035 }
13036
13037 /* Map special whitespace to '\t', \n', '\r' */
13038 if (ch == '\t') {
13039 PyUnicode_WRITE(okind, odata, o++, '\\');
13040 PyUnicode_WRITE(okind, odata, o++, 't');
13041 }
13042 else if (ch == '\n') {
13043 PyUnicode_WRITE(okind, odata, o++, '\\');
13044 PyUnicode_WRITE(okind, odata, o++, 'n');
13045 }
13046 else if (ch == '\r') {
13047 PyUnicode_WRITE(okind, odata, o++, '\\');
13048 PyUnicode_WRITE(okind, odata, o++, 'r');
13049 }
13050
13051 /* Map non-printable US ASCII to '\xhh' */
13052 else if (ch < ' ' || ch == 0x7F) {
13053 PyUnicode_WRITE(okind, odata, o++, '\\');
13054 PyUnicode_WRITE(okind, odata, o++, 'x');
13055 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13056 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13057 }
13058
13059 /* Copy ASCII characters as-is */
13060 else if (ch < 0x7F) {
13061 PyUnicode_WRITE(okind, odata, o++, ch);
13062 }
13063
13064 /* Non-ASCII characters */
13065 else {
13066 /* Map Unicode whitespace and control characters
13067 (categories Z* and C* except ASCII space)
13068 */
13069 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13070 PyUnicode_WRITE(okind, odata, o++, '\\');
13071 /* Map 8-bit characters to '\xhh' */
13072 if (ch <= 0xff) {
13073 PyUnicode_WRITE(okind, odata, o++, 'x');
13074 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13075 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13076 }
13077 /* Map 16-bit characters to '\uxxxx' */
13078 else if (ch <= 0xffff) {
13079 PyUnicode_WRITE(okind, odata, o++, 'u');
13080 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13081 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13082 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13083 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13084 }
13085 /* Map 21-bit characters to '\U00xxxxxx' */
13086 else {
13087 PyUnicode_WRITE(okind, odata, o++, 'U');
13088 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13089 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13090 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13091 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13092 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13093 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13094 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13095 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13096 }
13097 }
13098 /* Copy characters as-is */
13099 else {
13100 PyUnicode_WRITE(okind, odata, o++, ch);
13101 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013102 }
13103 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013104 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013105 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013106 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013107 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108}
13109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013110PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013111 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112\n\
13113Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013114such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115arguments start and end are interpreted as in slice notation.\n\
13116\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013117Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118
13119static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013120unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013122 /* initialize variables to prevent gcc warning */
13123 PyObject *substring = NULL;
13124 Py_ssize_t start = 0;
13125 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013126 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013128 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013131 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013133
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013134 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136 if (result == -2)
13137 return NULL;
13138
Christian Heimes217cfd12007-12-02 14:31:20 +000013139 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140}
13141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013142PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013143 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013145Return the highest index in S where substring sub is found,\n\
13146such that sub is contained within S[start:end]. Optional\n\
13147arguments start and end are interpreted as in slice notation.\n\
13148\n\
13149Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150
13151static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013154 /* initialize variables to prevent gcc warning */
13155 PyObject *substring = NULL;
13156 Py_ssize_t start = 0;
13157 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013158 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013160 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013161 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013162
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013163 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013164 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013165
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013166 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013168 if (result == -2)
13169 return NULL;
13170
Guido van Rossumd57fd912000-03-10 22:53:23 +000013171 if (result < 0) {
13172 PyErr_SetString(PyExc_ValueError, "substring not found");
13173 return NULL;
13174 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013175
Christian Heimes217cfd12007-12-02 14:31:20 +000013176 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013177}
13178
INADA Naoki3ae20562017-01-16 20:41:20 +090013179/*[clinic input]
13180str.rjust as unicode_rjust
13181
13182 width: Py_ssize_t
13183 fillchar: Py_UCS4 = ' '
13184 /
13185
13186Return a right-justified string of length width.
13187
13188Padding is done using the specified fill character (default is a space).
13189[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190
13191static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013192unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13193/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013195 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013196 return NULL;
13197
Victor Stinnerc4b49542011-12-11 22:44:26 +010013198 if (PyUnicode_GET_LENGTH(self) >= width)
13199 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200
Victor Stinnerc4b49542011-12-11 22:44:26 +010013201 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202}
13203
Alexander Belopolsky40018472011-02-26 01:02:56 +000013204PyObject *
13205PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013206{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013207 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013208 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013210 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211}
13212
INADA Naoki3ae20562017-01-16 20:41:20 +090013213/*[clinic input]
13214str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215
INADA Naoki3ae20562017-01-16 20:41:20 +090013216 sep: object = None
13217 The delimiter according which to split the string.
13218 None (the default value) means split according to any whitespace,
13219 and discard empty strings from the result.
13220 maxsplit: Py_ssize_t = -1
13221 Maximum number of splits to do.
13222 -1 (the default value) means no limit.
13223
13224Return a list of the words in the string, using sep as the delimiter string.
13225[clinic start generated code]*/
13226
13227static PyObject *
13228unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13229/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013230{
INADA Naoki3ae20562017-01-16 20:41:20 +090013231 if (sep == Py_None)
13232 return split(self, NULL, maxsplit);
13233 if (PyUnicode_Check(sep))
13234 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013235
Victor Stinner998b8062018-09-12 00:23:25 +020013236 PyErr_Format(PyExc_TypeError,
13237 "must be str or None, not %.100s",
13238 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013240}
13241
Thomas Wouters477c8d52006-05-27 19:21:47 +000013242PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013243PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013244{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013245 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013246 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013247 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013248 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013249
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013250 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013251 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013252
Victor Stinner14f8f022011-10-05 20:58:25 +020013253 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013254 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013255 len1 = PyUnicode_GET_LENGTH(str_obj);
13256 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013257 if (kind1 < kind2 || len1 < len2) {
13258 _Py_INCREF_UNICODE_EMPTY();
13259 if (!unicode_empty)
13260 out = NULL;
13261 else {
13262 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13263 Py_DECREF(unicode_empty);
13264 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013265 return out;
13266 }
13267 buf1 = PyUnicode_DATA(str_obj);
13268 buf2 = PyUnicode_DATA(sep_obj);
13269 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013270 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013271 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013272 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013273 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013274
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013275 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013276 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013277 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13278 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13279 else
13280 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013281 break;
13282 case PyUnicode_2BYTE_KIND:
13283 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13284 break;
13285 case PyUnicode_4BYTE_KIND:
13286 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13287 break;
13288 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013289 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013290 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013291
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013292 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013293 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013294 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013295
13296 return out;
13297}
13298
13299
13300PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013301PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013302{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013303 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013304 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013305 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013306 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013307
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013308 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013309 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013310
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013311 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013312 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013313 len1 = PyUnicode_GET_LENGTH(str_obj);
13314 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013315 if (kind1 < kind2 || len1 < len2) {
13316 _Py_INCREF_UNICODE_EMPTY();
13317 if (!unicode_empty)
13318 out = NULL;
13319 else {
13320 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13321 Py_DECREF(unicode_empty);
13322 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013323 return out;
13324 }
13325 buf1 = PyUnicode_DATA(str_obj);
13326 buf2 = PyUnicode_DATA(sep_obj);
13327 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013328 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013329 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013330 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013331 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013332
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013333 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013335 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13336 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13337 else
13338 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013339 break;
13340 case PyUnicode_2BYTE_KIND:
13341 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13342 break;
13343 case PyUnicode_4BYTE_KIND:
13344 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13345 break;
13346 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013347 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013348 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013349
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013350 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013351 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013352 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013353
13354 return out;
13355}
13356
INADA Naoki3ae20562017-01-16 20:41:20 +090013357/*[clinic input]
13358str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013359
INADA Naoki3ae20562017-01-16 20:41:20 +090013360 sep: object
13361 /
13362
13363Partition the string into three parts using the given separator.
13364
13365This will search for the separator in the string. If the separator is found,
13366returns a 3-tuple containing the part before the separator, the separator
13367itself, and the part after it.
13368
13369If the separator is not found, returns a 3-tuple containing the original string
13370and two empty strings.
13371[clinic start generated code]*/
13372
13373static PyObject *
13374unicode_partition(PyObject *self, PyObject *sep)
13375/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013376{
INADA Naoki3ae20562017-01-16 20:41:20 +090013377 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013378}
13379
INADA Naoki3ae20562017-01-16 20:41:20 +090013380/*[clinic input]
13381str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013382
INADA Naoki3ae20562017-01-16 20:41:20 +090013383Partition the string into three parts using the given separator.
13384
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013385This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013386the separator is found, returns a 3-tuple containing the part before the
13387separator, the separator itself, and the part after it.
13388
13389If the separator is not found, returns a 3-tuple containing two empty strings
13390and the original string.
13391[clinic start generated code]*/
13392
13393static PyObject *
13394unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013395/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013396{
INADA Naoki3ae20562017-01-16 20:41:20 +090013397 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013398}
13399
Alexander Belopolsky40018472011-02-26 01:02:56 +000013400PyObject *
13401PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013402{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013403 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013404 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013405
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013406 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013407}
13408
INADA Naoki3ae20562017-01-16 20:41:20 +090013409/*[clinic input]
13410str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013411
INADA Naoki3ae20562017-01-16 20:41:20 +090013412Return a list of the words in the string, using sep as the delimiter string.
13413
13414Splits are done starting at the end of the string and working to the front.
13415[clinic start generated code]*/
13416
13417static PyObject *
13418unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13419/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013420{
INADA Naoki3ae20562017-01-16 20:41:20 +090013421 if (sep == Py_None)
13422 return rsplit(self, NULL, maxsplit);
13423 if (PyUnicode_Check(sep))
13424 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013425
Victor Stinner998b8062018-09-12 00:23:25 +020013426 PyErr_Format(PyExc_TypeError,
13427 "must be str or None, not %.100s",
13428 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013429 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013430}
13431
INADA Naoki3ae20562017-01-16 20:41:20 +090013432/*[clinic input]
13433str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013434
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013435 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013436
13437Return a list of the lines in the string, breaking at line boundaries.
13438
13439Line breaks are not included in the resulting list unless keepends is given and
13440true.
13441[clinic start generated code]*/
13442
13443static PyObject *
13444unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013445/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013446{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013447 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013448}
13449
13450static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013451PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013452{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013453 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013454}
13455
INADA Naoki3ae20562017-01-16 20:41:20 +090013456/*[clinic input]
13457str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013458
INADA Naoki3ae20562017-01-16 20:41:20 +090013459Convert uppercase characters to lowercase and lowercase characters to uppercase.
13460[clinic start generated code]*/
13461
13462static PyObject *
13463unicode_swapcase_impl(PyObject *self)
13464/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013465{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013466 if (PyUnicode_READY(self) == -1)
13467 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013468 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013469}
13470
Larry Hastings61272b72014-01-07 12:41:53 -080013471/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013472
Larry Hastings31826802013-10-19 00:09:25 -070013473@staticmethod
13474str.maketrans as unicode_maketrans
13475
13476 x: object
13477
13478 y: unicode=NULL
13479
13480 z: unicode=NULL
13481
13482 /
13483
13484Return a translation table usable for str.translate().
13485
13486If there is only one argument, it must be a dictionary mapping Unicode
13487ordinals (integers) or characters to Unicode ordinals, strings or None.
13488Character keys will be then converted to ordinals.
13489If there are two arguments, they must be strings of equal length, and
13490in the resulting dictionary, each character in x will be mapped to the
13491character at the same position in y. If there is a third argument, it
13492must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013493[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013494
Larry Hastings31826802013-10-19 00:09:25 -070013495static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013496unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013497/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013498{
Georg Brandlceee0772007-11-27 23:48:05 +000013499 PyObject *new = NULL, *key, *value;
13500 Py_ssize_t i = 0;
13501 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013502
Georg Brandlceee0772007-11-27 23:48:05 +000013503 new = PyDict_New();
13504 if (!new)
13505 return NULL;
13506 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013507 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013508 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013509
Georg Brandlceee0772007-11-27 23:48:05 +000013510 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013511 if (!PyUnicode_Check(x)) {
13512 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13513 "be a string if there is a second argument");
13514 goto err;
13515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013516 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013517 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13518 "arguments must have equal length");
13519 goto err;
13520 }
13521 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013522 x_kind = PyUnicode_KIND(x);
13523 y_kind = PyUnicode_KIND(y);
13524 x_data = PyUnicode_DATA(x);
13525 y_data = PyUnicode_DATA(y);
13526 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13527 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013528 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013529 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013530 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013531 if (!value) {
13532 Py_DECREF(key);
13533 goto err;
13534 }
Georg Brandlceee0772007-11-27 23:48:05 +000013535 res = PyDict_SetItem(new, key, value);
13536 Py_DECREF(key);
13537 Py_DECREF(value);
13538 if (res < 0)
13539 goto err;
13540 }
13541 /* create entries for deleting chars in z */
13542 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013543 z_kind = PyUnicode_KIND(z);
13544 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013545 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013546 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013547 if (!key)
13548 goto err;
13549 res = PyDict_SetItem(new, key, Py_None);
13550 Py_DECREF(key);
13551 if (res < 0)
13552 goto err;
13553 }
13554 }
13555 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013556 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013557 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013558
Georg Brandlceee0772007-11-27 23:48:05 +000013559 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013560 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013561 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13562 "to maketrans it must be a dict");
13563 goto err;
13564 }
13565 /* copy entries into the new dict, converting string keys to int keys */
13566 while (PyDict_Next(x, &i, &key, &value)) {
13567 if (PyUnicode_Check(key)) {
13568 /* convert string keys to integer keys */
13569 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013570 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013571 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13572 "table must be of length 1");
13573 goto err;
13574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013575 kind = PyUnicode_KIND(key);
13576 data = PyUnicode_DATA(key);
13577 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013578 if (!newkey)
13579 goto err;
13580 res = PyDict_SetItem(new, newkey, value);
13581 Py_DECREF(newkey);
13582 if (res < 0)
13583 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013584 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013585 /* just keep integer keys */
13586 if (PyDict_SetItem(new, key, value) < 0)
13587 goto err;
13588 } else {
13589 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13590 "be strings or integers");
13591 goto err;
13592 }
13593 }
13594 }
13595 return new;
13596 err:
13597 Py_DECREF(new);
13598 return NULL;
13599}
13600
INADA Naoki3ae20562017-01-16 20:41:20 +090013601/*[clinic input]
13602str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013603
INADA Naoki3ae20562017-01-16 20:41:20 +090013604 table: object
13605 Translation table, which must be a mapping of Unicode ordinals to
13606 Unicode ordinals, strings, or None.
13607 /
13608
13609Replace each character in the string using the given translation table.
13610
13611The table must implement lookup/indexing via __getitem__, for instance a
13612dictionary or list. If this operation raises LookupError, the character is
13613left untouched. Characters mapped to None are deleted.
13614[clinic start generated code]*/
13615
13616static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013618/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013620 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013621}
13622
INADA Naoki3ae20562017-01-16 20:41:20 +090013623/*[clinic input]
13624str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013625
INADA Naoki3ae20562017-01-16 20:41:20 +090013626Return a copy of the string converted to uppercase.
13627[clinic start generated code]*/
13628
13629static PyObject *
13630unicode_upper_impl(PyObject *self)
13631/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013632{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013633 if (PyUnicode_READY(self) == -1)
13634 return NULL;
13635 if (PyUnicode_IS_ASCII(self))
13636 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013637 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013638}
13639
INADA Naoki3ae20562017-01-16 20:41:20 +090013640/*[clinic input]
13641str.zfill as unicode_zfill
13642
13643 width: Py_ssize_t
13644 /
13645
13646Pad a numeric string with zeros on the left, to fill a field of the given width.
13647
13648The string is never truncated.
13649[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013650
13651static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013652unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013653/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013654{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013655 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013656 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013657 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013658 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013659 Py_UCS4 chr;
13660
Benjamin Petersonbac79492012-01-14 13:34:47 -050013661 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013662 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013663
Victor Stinnerc4b49542011-12-11 22:44:26 +010013664 if (PyUnicode_GET_LENGTH(self) >= width)
13665 return unicode_result_unchanged(self);
13666
13667 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013668
13669 u = pad(self, fill, 0, '0');
13670
Walter Dörwald068325e2002-04-15 13:36:47 +000013671 if (u == NULL)
13672 return NULL;
13673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013674 kind = PyUnicode_KIND(u);
13675 data = PyUnicode_DATA(u);
13676 chr = PyUnicode_READ(kind, data, fill);
13677
13678 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013679 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013680 PyUnicode_WRITE(kind, data, 0, chr);
13681 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013682 }
13683
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013684 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013685 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013686}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013687
#if 0
/* Compiled out: wrapper that passes self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013696PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013697 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013698\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013699Return True if S starts with the specified prefix, False otherwise.\n\
13700With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013701With optional end, stop comparing S at that position.\n\
13702prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013703
13704static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013705unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013706 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013707{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013708 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013709 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013710 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013711 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013712 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013713
Jesus Ceaac451502011-04-20 17:09:23 +020013714 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013715 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013716 if (PyTuple_Check(subobj)) {
13717 Py_ssize_t i;
13718 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013719 substring = PyTuple_GET_ITEM(subobj, i);
13720 if (!PyUnicode_Check(substring)) {
13721 PyErr_Format(PyExc_TypeError,
13722 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013723 "not %.100s",
13724 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013725 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013726 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013727 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013728 if (result == -1)
13729 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013730 if (result) {
13731 Py_RETURN_TRUE;
13732 }
13733 }
13734 /* nothing matched */
13735 Py_RETURN_FALSE;
13736 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013737 if (!PyUnicode_Check(subobj)) {
13738 PyErr_Format(PyExc_TypeError,
13739 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013740 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013741 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013742 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013743 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013744 if (result == -1)
13745 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013746 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013747}
13748
13749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013750PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013751 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013752\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013753Return True if S ends with the specified suffix, False otherwise.\n\
13754With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013755With optional end, stop comparing S at that position.\n\
13756suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013757
13758static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013759unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013760 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013761{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013762 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013763 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013764 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013765 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013766 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013767
Jesus Ceaac451502011-04-20 17:09:23 +020013768 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013769 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013770 if (PyTuple_Check(subobj)) {
13771 Py_ssize_t i;
13772 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013773 substring = PyTuple_GET_ITEM(subobj, i);
13774 if (!PyUnicode_Check(substring)) {
13775 PyErr_Format(PyExc_TypeError,
13776 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013777 "not %.100s",
13778 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013779 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013780 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013781 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013782 if (result == -1)
13783 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013784 if (result) {
13785 Py_RETURN_TRUE;
13786 }
13787 }
13788 Py_RETURN_FALSE;
13789 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013790 if (!PyUnicode_Check(subobj)) {
13791 PyErr_Format(PyExc_TypeError,
13792 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013793 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013794 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013795 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013796 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013797 if (result == -1)
13798 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013799 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013800}
13801
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013802static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013803_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013804{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013805 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13806 writer->data = PyUnicode_DATA(writer->buffer);
13807
13808 if (!writer->readonly) {
13809 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013810 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013811 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013812 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013813 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13814 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13815 writer->kind = PyUnicode_WCHAR_KIND;
13816 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13817
Victor Stinner8f674cc2013-04-17 23:02:17 +020013818 /* Copy-on-write mode: set buffer size to 0 so
13819 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13820 * next write. */
13821 writer->size = 0;
13822 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013823}
13824
Victor Stinnerd3f08822012-05-29 12:57:52 +020013825void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013826_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013827{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013828 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013829
13830 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013831 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013832
13833 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13834 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13835 writer->kind = PyUnicode_WCHAR_KIND;
13836 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013837}
13838
Inada Naoki770847a2019-06-24 12:30:24 +090013839// Initialize _PyUnicodeWriter with initial buffer
13840static inline void
13841_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13842{
13843 memset(writer, 0, sizeof(*writer));
13844 writer->buffer = buffer;
13845 _PyUnicodeWriter_Update(writer);
13846 writer->min_length = writer->size;
13847}
13848
Victor Stinnerd3f08822012-05-29 12:57:52 +020013849int
13850_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13851 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013852{
13853 Py_ssize_t newlen;
13854 PyObject *newbuffer;
13855
Victor Stinner2740e462016-09-06 16:58:36 -070013856 assert(maxchar <= MAX_UNICODE);
13857
Victor Stinnerca9381e2015-09-22 00:58:32 +020013858 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013859 assert((maxchar > writer->maxchar && length >= 0)
13860 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013861
Victor Stinner202fdca2012-05-07 12:47:02 +020013862 if (length > PY_SSIZE_T_MAX - writer->pos) {
13863 PyErr_NoMemory();
13864 return -1;
13865 }
13866 newlen = writer->pos + length;
13867
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013868 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013869
Victor Stinnerd3f08822012-05-29 12:57:52 +020013870 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013871 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013872 if (writer->overallocate
13873 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13874 /* overallocate to limit the number of realloc() */
13875 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013876 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013877 if (newlen < writer->min_length)
13878 newlen = writer->min_length;
13879
Victor Stinnerd3f08822012-05-29 12:57:52 +020013880 writer->buffer = PyUnicode_New(newlen, maxchar);
13881 if (writer->buffer == NULL)
13882 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013883 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013884 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013885 if (writer->overallocate
13886 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13887 /* overallocate to limit the number of realloc() */
13888 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013889 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013890 if (newlen < writer->min_length)
13891 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013892
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013893 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013894 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013895 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013896 newbuffer = PyUnicode_New(newlen, maxchar);
13897 if (newbuffer == NULL)
13898 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013899 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13900 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013901 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013902 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013903 }
13904 else {
13905 newbuffer = resize_compact(writer->buffer, newlen);
13906 if (newbuffer == NULL)
13907 return -1;
13908 }
13909 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013910 }
13911 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013912 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013913 newbuffer = PyUnicode_New(writer->size, maxchar);
13914 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013915 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013916 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13917 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013918 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013919 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013920 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013921 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013922
13923#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013924}
13925
Victor Stinnerca9381e2015-09-22 00:58:32 +020013926int
13927_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13928 enum PyUnicode_Kind kind)
13929{
13930 Py_UCS4 maxchar;
13931
13932 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13933 assert(writer->kind < kind);
13934
13935 switch (kind)
13936 {
13937 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13938 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13939 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13940 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013941 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013942 }
13943
13944 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13945}
13946
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013947static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013948_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013949{
Victor Stinner2740e462016-09-06 16:58:36 -070013950 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013951 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13952 return -1;
13953 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13954 writer->pos++;
13955 return 0;
13956}
13957
13958int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013959_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13960{
13961 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13962}
13963
13964int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013965_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13966{
13967 Py_UCS4 maxchar;
13968 Py_ssize_t len;
13969
13970 if (PyUnicode_READY(str) == -1)
13971 return -1;
13972 len = PyUnicode_GET_LENGTH(str);
13973 if (len == 0)
13974 return 0;
13975 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13976 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013977 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013978 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013979 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013980 Py_INCREF(str);
13981 writer->buffer = str;
13982 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013983 writer->pos += len;
13984 return 0;
13985 }
13986 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13987 return -1;
13988 }
13989 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13990 str, 0, len);
13991 writer->pos += len;
13992 return 0;
13993}
13994
Victor Stinnere215d962012-10-06 23:03:36 +020013995int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013996_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13997 Py_ssize_t start, Py_ssize_t end)
13998{
13999 Py_UCS4 maxchar;
14000 Py_ssize_t len;
14001
14002 if (PyUnicode_READY(str) == -1)
14003 return -1;
14004
14005 assert(0 <= start);
14006 assert(end <= PyUnicode_GET_LENGTH(str));
14007 assert(start <= end);
14008
14009 if (end == 0)
14010 return 0;
14011
14012 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14013 return _PyUnicodeWriter_WriteStr(writer, str);
14014
14015 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14016 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14017 else
14018 maxchar = writer->maxchar;
14019 len = end - start;
14020
14021 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14022 return -1;
14023
14024 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14025 str, start, len);
14026 writer->pos += len;
14027 return 0;
14028}
14029
14030int
Victor Stinner4a587072013-11-19 12:54:53 +010014031_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14032 const char *ascii, Py_ssize_t len)
14033{
14034 if (len == -1)
14035 len = strlen(ascii);
14036
Andy Lestere6be9b52020-02-11 20:28:35 -060014037 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014038
14039 if (writer->buffer == NULL && !writer->overallocate) {
14040 PyObject *str;
14041
14042 str = _PyUnicode_FromASCII(ascii, len);
14043 if (str == NULL)
14044 return -1;
14045
14046 writer->readonly = 1;
14047 writer->buffer = str;
14048 _PyUnicodeWriter_Update(writer);
14049 writer->pos += len;
14050 return 0;
14051 }
14052
14053 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14054 return -1;
14055
14056 switch (writer->kind)
14057 {
14058 case PyUnicode_1BYTE_KIND:
14059 {
14060 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14061 Py_UCS1 *data = writer->data;
14062
Christian Heimesf051e432016-09-13 20:22:02 +020014063 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014064 break;
14065 }
14066 case PyUnicode_2BYTE_KIND:
14067 {
14068 _PyUnicode_CONVERT_BYTES(
14069 Py_UCS1, Py_UCS2,
14070 ascii, ascii + len,
14071 (Py_UCS2 *)writer->data + writer->pos);
14072 break;
14073 }
14074 case PyUnicode_4BYTE_KIND:
14075 {
14076 _PyUnicode_CONVERT_BYTES(
14077 Py_UCS1, Py_UCS4,
14078 ascii, ascii + len,
14079 (Py_UCS4 *)writer->data + writer->pos);
14080 break;
14081 }
14082 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014083 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014084 }
14085
14086 writer->pos += len;
14087 return 0;
14088}
14089
14090int
14091_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14092 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014093{
14094 Py_UCS4 maxchar;
14095
Andy Lestere6be9b52020-02-11 20:28:35 -060014096 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014097 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14098 return -1;
14099 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14100 writer->pos += len;
14101 return 0;
14102}
14103
Victor Stinnerd3f08822012-05-29 12:57:52 +020014104PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014105_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014106{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014107 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014108
Victor Stinnerd3f08822012-05-29 12:57:52 +020014109 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014110 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014111 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014112 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014113
14114 str = writer->buffer;
14115 writer->buffer = NULL;
14116
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014117 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014118 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14119 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014120 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014121
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014122 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14123 PyObject *str2;
14124 str2 = resize_compact(str, writer->pos);
14125 if (str2 == NULL) {
14126 Py_DECREF(str);
14127 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014128 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014129 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014130 }
14131
Victor Stinner15a0bd32013-07-08 22:29:55 +020014132 assert(_PyUnicode_CheckConsistency(str, 1));
14133 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014134}
14135
Victor Stinnerd3f08822012-05-29 12:57:52 +020014136void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014137_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014138{
14139 Py_CLEAR(writer->buffer);
14140}
14141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014142#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014143
14144PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014145 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014146\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014147Return a formatted version of S, using substitutions from args and kwargs.\n\
14148The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014149
Eric Smith27bbca62010-11-04 17:06:58 +000014150PyDoc_STRVAR(format_map__doc__,
14151 "S.format_map(mapping) -> str\n\
14152\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014153Return a formatted version of S, using substitutions from mapping.\n\
14154The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014155
INADA Naoki3ae20562017-01-16 20:41:20 +090014156/*[clinic input]
14157str.__format__ as unicode___format__
14158
14159 format_spec: unicode
14160 /
14161
14162Return a formatted version of the string as described by format_spec.
14163[clinic start generated code]*/
14164
Eric Smith4a7d76d2008-05-30 18:10:19 +000014165static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014166unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014167/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014168{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014169 _PyUnicodeWriter writer;
14170 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014171
Victor Stinnerd3f08822012-05-29 12:57:52 +020014172 if (PyUnicode_READY(self) == -1)
14173 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014174 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014175 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14176 self, format_spec, 0,
14177 PyUnicode_GET_LENGTH(format_spec));
14178 if (ret == -1) {
14179 _PyUnicodeWriter_Dealloc(&writer);
14180 return NULL;
14181 }
14182 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014183}
14184
INADA Naoki3ae20562017-01-16 20:41:20 +090014185/*[clinic input]
14186str.__sizeof__ as unicode_sizeof
14187
14188Return the size of the string in memory, in bytes.
14189[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014190
14191static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014192unicode_sizeof_impl(PyObject *self)
14193/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014195 Py_ssize_t size;
14196
14197 /* If it's a compact object, account for base structure +
14198 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014199 if (PyUnicode_IS_COMPACT_ASCII(self))
14200 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14201 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014202 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014203 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014204 else {
14205 /* If it is a two-block object, account for base object, and
14206 for character block if present. */
14207 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014208 if (_PyUnicode_DATA_ANY(self))
14209 size += (PyUnicode_GET_LENGTH(self) + 1) *
14210 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014211 }
14212 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014213 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014214 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14215 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14216 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14217 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014218
14219 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014220}
14221
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014222static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014223unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014224{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014225 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014226 if (!copy)
14227 return NULL;
14228 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014229}
14230
Guido van Rossumd57fd912000-03-10 22:53:23 +000014231static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014232 UNICODE_ENCODE_METHODDEF
14233 UNICODE_REPLACE_METHODDEF
14234 UNICODE_SPLIT_METHODDEF
14235 UNICODE_RSPLIT_METHODDEF
14236 UNICODE_JOIN_METHODDEF
14237 UNICODE_CAPITALIZE_METHODDEF
14238 UNICODE_CASEFOLD_METHODDEF
14239 UNICODE_TITLE_METHODDEF
14240 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014241 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014242 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014243 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014244 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014245 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014246 UNICODE_LJUST_METHODDEF
14247 UNICODE_LOWER_METHODDEF
14248 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014249 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14250 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014251 UNICODE_RJUST_METHODDEF
14252 UNICODE_RSTRIP_METHODDEF
14253 UNICODE_RPARTITION_METHODDEF
14254 UNICODE_SPLITLINES_METHODDEF
14255 UNICODE_STRIP_METHODDEF
14256 UNICODE_SWAPCASE_METHODDEF
14257 UNICODE_TRANSLATE_METHODDEF
14258 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014259 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14260 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014261 UNICODE_REMOVEPREFIX_METHODDEF
14262 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014263 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014264 UNICODE_ISLOWER_METHODDEF
14265 UNICODE_ISUPPER_METHODDEF
14266 UNICODE_ISTITLE_METHODDEF
14267 UNICODE_ISSPACE_METHODDEF
14268 UNICODE_ISDECIMAL_METHODDEF
14269 UNICODE_ISDIGIT_METHODDEF
14270 UNICODE_ISNUMERIC_METHODDEF
14271 UNICODE_ISALPHA_METHODDEF
14272 UNICODE_ISALNUM_METHODDEF
14273 UNICODE_ISIDENTIFIER_METHODDEF
14274 UNICODE_ISPRINTABLE_METHODDEF
14275 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014276 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014277 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014278 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014279 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014280 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014281#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014282 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014283 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014284#endif
14285
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014286 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014287 {NULL, NULL}
14288};
14289
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014290static PyObject *
14291unicode_mod(PyObject *v, PyObject *w)
14292{
Brian Curtindfc80e32011-08-10 20:28:54 -050014293 if (!PyUnicode_Check(v))
14294 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014295 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014296}
14297
14298static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014299 0, /*nb_add*/
14300 0, /*nb_subtract*/
14301 0, /*nb_multiply*/
14302 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014303};
14304
Guido van Rossumd57fd912000-03-10 22:53:23 +000014305static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014306 (lenfunc) unicode_length, /* sq_length */
14307 PyUnicode_Concat, /* sq_concat */
14308 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14309 (ssizeargfunc) unicode_getitem, /* sq_item */
14310 0, /* sq_slice */
14311 0, /* sq_ass_item */
14312 0, /* sq_ass_slice */
14313 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014314};
14315
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014316static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014317unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014318{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014319 if (PyUnicode_READY(self) == -1)
14320 return NULL;
14321
Victor Stinnera15e2602020-04-08 02:01:56 +020014322 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014323 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014324 if (i == -1 && PyErr_Occurred())
14325 return NULL;
14326 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014327 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014328 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014329 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014330 Py_ssize_t start, stop, step, slicelength, i;
14331 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014332 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014333 const void *src_data;
14334 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014335 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014336 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014337
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014338 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014339 return NULL;
14340 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014341 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14342 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014343
14344 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014345 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014346 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014347 slicelength == PyUnicode_GET_LENGTH(self)) {
14348 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014349 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014350 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014351 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014352 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014353 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014354 src_kind = PyUnicode_KIND(self);
14355 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014356 if (!PyUnicode_IS_ASCII(self)) {
14357 kind_limit = kind_maxchar_limit(src_kind);
14358 max_char = 0;
14359 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14360 ch = PyUnicode_READ(src_kind, src_data, cur);
14361 if (ch > max_char) {
14362 max_char = ch;
14363 if (max_char >= kind_limit)
14364 break;
14365 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014366 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014367 }
Victor Stinner55c99112011-10-13 01:17:06 +020014368 else
14369 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014370 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014371 if (result == NULL)
14372 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014373 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014374 dest_data = PyUnicode_DATA(result);
14375
14376 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014377 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14378 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014379 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014380 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014381 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014382 } else {
14383 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14384 return NULL;
14385 }
14386}
14387
14388static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014389 (lenfunc)unicode_length, /* mp_length */
14390 (binaryfunc)unicode_subscript, /* mp_subscript */
14391 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014392};
14393
Guido van Rossumd57fd912000-03-10 22:53:23 +000014394
Guido van Rossumd57fd912000-03-10 22:53:23 +000014395/* Helpers for PyUnicode_Format() */
14396
Victor Stinnera47082312012-10-04 02:19:54 +020014397struct unicode_formatter_t {
14398 PyObject *args;
14399 int args_owned;
14400 Py_ssize_t arglen, argidx;
14401 PyObject *dict;
14402
14403 enum PyUnicode_Kind fmtkind;
14404 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014405 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014406 PyObject *fmtstr;
14407
14408 _PyUnicodeWriter writer;
14409};
14410
14411struct unicode_format_arg_t {
14412 Py_UCS4 ch;
14413 int flags;
14414 Py_ssize_t width;
14415 int prec;
14416 int sign;
14417};
14418
Guido van Rossumd57fd912000-03-10 22:53:23 +000014419static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014420unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014421{
Victor Stinnera47082312012-10-04 02:19:54 +020014422 Py_ssize_t argidx = ctx->argidx;
14423
14424 if (argidx < ctx->arglen) {
14425 ctx->argidx++;
14426 if (ctx->arglen < 0)
14427 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014428 else
Victor Stinnera47082312012-10-04 02:19:54 +020014429 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014430 }
14431 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014432 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014433 return NULL;
14434}
14435
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014436/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014437
Victor Stinnera47082312012-10-04 02:19:54 +020014438/* Format a float into the writer if the writer is not NULL, or into *p_output
14439 otherwise.
14440
14441 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014442static int
Victor Stinnera47082312012-10-04 02:19:54 +020014443formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14444 PyObject **p_output,
14445 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014446{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014447 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014448 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014449 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014450 int prec;
14451 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014452
Guido van Rossumd57fd912000-03-10 22:53:23 +000014453 x = PyFloat_AsDouble(v);
14454 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014455 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014456
Victor Stinnera47082312012-10-04 02:19:54 +020014457 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014458 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014459 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014460
Victor Stinnera47082312012-10-04 02:19:54 +020014461 if (arg->flags & F_ALT)
14462 dtoa_flags = Py_DTSF_ALT;
14463 else
14464 dtoa_flags = 0;
14465 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014466 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014467 return -1;
14468 len = strlen(p);
14469 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014470 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014471 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014472 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014473 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014474 }
14475 else
14476 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014477 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014478 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014479}
14480
Victor Stinnerd0880d52012-04-27 23:40:13 +020014481/* formatlong() emulates the format codes d, u, o, x and X, and
14482 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14483 * Python's regular ints.
14484 * Return value: a new PyUnicodeObject*, or NULL if error.
14485 * The output string is of the form
14486 * "-"? ("0x" | "0X")? digit+
14487 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14488 * set in flags. The case of hex digits will be correct,
14489 * There will be at least prec digits, zero-filled on the left if
14490 * necessary to get that many.
14491 * val object to be converted
14492 * flags bitmask of format flags; only F_ALT is looked at
14493 * prec minimum number of digits; 0-fill on left if needed
14494 * type a character in [duoxX]; u acts the same as d
14495 *
14496 * CAUTION: o, x and X conversions on regular ints can never
14497 * produce a '-' sign, but can for Python's unbounded ints.
14498 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014499PyObject *
14500_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014501{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014502 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014503 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014504 Py_ssize_t i;
14505 int sign; /* 1 if '-', else 0 */
14506 int len; /* number of characters */
14507 Py_ssize_t llen;
14508 int numdigits; /* len == numnondigits + numdigits */
14509 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014510
Victor Stinnerd0880d52012-04-27 23:40:13 +020014511 /* Avoid exceeding SSIZE_T_MAX */
14512 if (prec > INT_MAX-3) {
14513 PyErr_SetString(PyExc_OverflowError,
14514 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014515 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014516 }
14517
14518 assert(PyLong_Check(val));
14519
14520 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014521 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014522 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014523 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014524 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014525 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014526 /* int and int subclasses should print numerically when a numeric */
14527 /* format code is used (see issue18780) */
14528 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014529 break;
14530 case 'o':
14531 numnondigits = 2;
14532 result = PyNumber_ToBase(val, 8);
14533 break;
14534 case 'x':
14535 case 'X':
14536 numnondigits = 2;
14537 result = PyNumber_ToBase(val, 16);
14538 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014539 }
14540 if (!result)
14541 return NULL;
14542
14543 assert(unicode_modifiable(result));
14544 assert(PyUnicode_IS_READY(result));
14545 assert(PyUnicode_IS_ASCII(result));
14546
14547 /* To modify the string in-place, there can only be one reference. */
14548 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014549 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014550 PyErr_BadInternalCall();
14551 return NULL;
14552 }
14553 buf = PyUnicode_DATA(result);
14554 llen = PyUnicode_GET_LENGTH(result);
14555 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014556 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014557 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014558 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014559 return NULL;
14560 }
14561 len = (int)llen;
14562 sign = buf[0] == '-';
14563 numnondigits += sign;
14564 numdigits = len - numnondigits;
14565 assert(numdigits > 0);
14566
14567 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014568 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014569 (type == 'o' || type == 'x' || type == 'X'))) {
14570 assert(buf[sign] == '0');
14571 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14572 buf[sign+1] == 'o');
14573 numnondigits -= 2;
14574 buf += 2;
14575 len -= 2;
14576 if (sign)
14577 buf[0] = '-';
14578 assert(len == numnondigits + numdigits);
14579 assert(numdigits > 0);
14580 }
14581
14582 /* Fill with leading zeroes to meet minimum width. */
14583 if (prec > numdigits) {
14584 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14585 numnondigits + prec);
14586 char *b1;
14587 if (!r1) {
14588 Py_DECREF(result);
14589 return NULL;
14590 }
14591 b1 = PyBytes_AS_STRING(r1);
14592 for (i = 0; i < numnondigits; ++i)
14593 *b1++ = *buf++;
14594 for (i = 0; i < prec - numdigits; i++)
14595 *b1++ = '0';
14596 for (i = 0; i < numdigits; i++)
14597 *b1++ = *buf++;
14598 *b1 = '\0';
14599 Py_DECREF(result);
14600 result = r1;
14601 buf = PyBytes_AS_STRING(result);
14602 len = numnondigits + prec;
14603 }
14604
14605 /* Fix up case for hex conversions. */
14606 if (type == 'X') {
14607 /* Need to convert all lower case letters to upper case.
14608 and need to convert 0x to 0X (and -0x to -0X). */
14609 for (i = 0; i < len; i++)
14610 if (buf[i] >= 'a' && buf[i] <= 'x')
14611 buf[i] -= 'a'-'A';
14612 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014613 if (!PyUnicode_Check(result)
14614 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014615 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014616 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014617 Py_DECREF(result);
14618 result = unicode;
14619 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014620 else if (len != PyUnicode_GET_LENGTH(result)) {
14621 if (PyUnicode_Resize(&result, len) < 0)
14622 Py_CLEAR(result);
14623 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014624 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014625}
14626
Ethan Furmandf3ed242014-01-05 06:50:30 -080014627/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014628 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014629 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014630 * -1 and raise an exception on error */
14631static int
Victor Stinnera47082312012-10-04 02:19:54 +020014632mainformatlong(PyObject *v,
14633 struct unicode_format_arg_t *arg,
14634 PyObject **p_output,
14635 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014636{
14637 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014638 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014639
14640 if (!PyNumber_Check(v))
14641 goto wrongtype;
14642
Ethan Furman9ab74802014-03-21 06:38:46 -070014643 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014644 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014645 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014646 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014647 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014648 if (PyErr_ExceptionMatches(PyExc_TypeError))
14649 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014650 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014651 }
14652 }
14653 else {
14654 iobj = PyNumber_Long(v);
14655 if (iobj == NULL ) {
14656 if (PyErr_ExceptionMatches(PyExc_TypeError))
14657 goto wrongtype;
14658 return -1;
14659 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014660 }
14661 assert(PyLong_Check(iobj));
14662 }
14663 else {
14664 iobj = v;
14665 Py_INCREF(iobj);
14666 }
14667
14668 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014669 && arg->width == -1 && arg->prec == -1
14670 && !(arg->flags & (F_SIGN | F_BLANK))
14671 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014672 {
14673 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014674 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014675 int base;
14676
Victor Stinnera47082312012-10-04 02:19:54 +020014677 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014678 {
14679 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014680 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014681 case 'd':
14682 case 'i':
14683 case 'u':
14684 base = 10;
14685 break;
14686 case 'o':
14687 base = 8;
14688 break;
14689 case 'x':
14690 case 'X':
14691 base = 16;
14692 break;
14693 }
14694
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014695 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14696 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014697 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014698 }
14699 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014700 return 1;
14701 }
14702
Ethan Furmanb95b5612015-01-23 20:05:18 -080014703 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014704 Py_DECREF(iobj);
14705 if (res == NULL)
14706 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014707 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014708 return 0;
14709
14710wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014711 switch(type)
14712 {
14713 case 'o':
14714 case 'x':
14715 case 'X':
14716 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014717 "%%%c format: an integer is required, "
14718 "not %.200s",
14719 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014720 break;
14721 default:
14722 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014723 "%%%c format: a number is required, "
14724 "not %.200s",
14725 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014726 break;
14727 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014728 return -1;
14729}
14730
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014731static Py_UCS4
14732formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014733{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014734 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014735 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014736 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014737 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014738 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014739 goto onError;
14740 }
14741 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014742 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014743 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014744 /* make sure number is a type of integer */
14745 if (!PyLong_Check(v)) {
14746 iobj = PyNumber_Index(v);
14747 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014748 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014749 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014750 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014751 Py_DECREF(iobj);
14752 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014753 else {
14754 x = PyLong_AsLong(v);
14755 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014756 if (x == -1 && PyErr_Occurred())
14757 goto onError;
14758
Victor Stinner8faf8212011-12-08 22:14:11 +010014759 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014760 PyErr_SetString(PyExc_OverflowError,
14761 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014762 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014763 }
14764
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014765 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014766 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014767
Benjamin Peterson29060642009-01-31 22:14:21 +000014768 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014769 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014770 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014771 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014772}
14773
Victor Stinnera47082312012-10-04 02:19:54 +020014774/* Parse options of an argument: flags, width, precision.
14775 Handle also "%(name)" syntax.
14776
14777 Return 0 if the argument has been formatted into arg->str.
14778 Return 1 if the argument has been written into ctx->writer,
14779 Raise an exception and return -1 on error. */
14780static int
14781unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14782 struct unicode_format_arg_t *arg)
14783{
14784#define FORMAT_READ(ctx) \
14785 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14786
14787 PyObject *v;
14788
Victor Stinnera47082312012-10-04 02:19:54 +020014789 if (arg->ch == '(') {
14790 /* Get argument value from a dictionary. Example: "%(name)s". */
14791 Py_ssize_t keystart;
14792 Py_ssize_t keylen;
14793 PyObject *key;
14794 int pcount = 1;
14795
14796 if (ctx->dict == NULL) {
14797 PyErr_SetString(PyExc_TypeError,
14798 "format requires a mapping");
14799 return -1;
14800 }
14801 ++ctx->fmtpos;
14802 --ctx->fmtcnt;
14803 keystart = ctx->fmtpos;
14804 /* Skip over balanced parentheses */
14805 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14806 arg->ch = FORMAT_READ(ctx);
14807 if (arg->ch == ')')
14808 --pcount;
14809 else if (arg->ch == '(')
14810 ++pcount;
14811 ctx->fmtpos++;
14812 }
14813 keylen = ctx->fmtpos - keystart - 1;
14814 if (ctx->fmtcnt < 0 || pcount > 0) {
14815 PyErr_SetString(PyExc_ValueError,
14816 "incomplete format key");
14817 return -1;
14818 }
14819 key = PyUnicode_Substring(ctx->fmtstr,
14820 keystart, keystart + keylen);
14821 if (key == NULL)
14822 return -1;
14823 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014824 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014825 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014826 }
14827 ctx->args = PyObject_GetItem(ctx->dict, key);
14828 Py_DECREF(key);
14829 if (ctx->args == NULL)
14830 return -1;
14831 ctx->args_owned = 1;
14832 ctx->arglen = -1;
14833 ctx->argidx = -2;
14834 }
14835
14836 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014837 while (--ctx->fmtcnt >= 0) {
14838 arg->ch = FORMAT_READ(ctx);
14839 ctx->fmtpos++;
14840 switch (arg->ch) {
14841 case '-': arg->flags |= F_LJUST; continue;
14842 case '+': arg->flags |= F_SIGN; continue;
14843 case ' ': arg->flags |= F_BLANK; continue;
14844 case '#': arg->flags |= F_ALT; continue;
14845 case '0': arg->flags |= F_ZERO; continue;
14846 }
14847 break;
14848 }
14849
14850 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014851 if (arg->ch == '*') {
14852 v = unicode_format_getnextarg(ctx);
14853 if (v == NULL)
14854 return -1;
14855 if (!PyLong_Check(v)) {
14856 PyErr_SetString(PyExc_TypeError,
14857 "* wants int");
14858 return -1;
14859 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014860 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014861 if (arg->width == -1 && PyErr_Occurred())
14862 return -1;
14863 if (arg->width < 0) {
14864 arg->flags |= F_LJUST;
14865 arg->width = -arg->width;
14866 }
14867 if (--ctx->fmtcnt >= 0) {
14868 arg->ch = FORMAT_READ(ctx);
14869 ctx->fmtpos++;
14870 }
14871 }
14872 else if (arg->ch >= '0' && arg->ch <= '9') {
14873 arg->width = arg->ch - '0';
14874 while (--ctx->fmtcnt >= 0) {
14875 arg->ch = FORMAT_READ(ctx);
14876 ctx->fmtpos++;
14877 if (arg->ch < '0' || arg->ch > '9')
14878 break;
14879 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14880 mixing signed and unsigned comparison. Since arg->ch is between
14881 '0' and '9', casting to int is safe. */
14882 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14883 PyErr_SetString(PyExc_ValueError,
14884 "width too big");
14885 return -1;
14886 }
14887 arg->width = arg->width*10 + (arg->ch - '0');
14888 }
14889 }
14890
14891 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014892 if (arg->ch == '.') {
14893 arg->prec = 0;
14894 if (--ctx->fmtcnt >= 0) {
14895 arg->ch = FORMAT_READ(ctx);
14896 ctx->fmtpos++;
14897 }
14898 if (arg->ch == '*') {
14899 v = unicode_format_getnextarg(ctx);
14900 if (v == NULL)
14901 return -1;
14902 if (!PyLong_Check(v)) {
14903 PyErr_SetString(PyExc_TypeError,
14904 "* wants int");
14905 return -1;
14906 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014907 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014908 if (arg->prec == -1 && PyErr_Occurred())
14909 return -1;
14910 if (arg->prec < 0)
14911 arg->prec = 0;
14912 if (--ctx->fmtcnt >= 0) {
14913 arg->ch = FORMAT_READ(ctx);
14914 ctx->fmtpos++;
14915 }
14916 }
14917 else if (arg->ch >= '0' && arg->ch <= '9') {
14918 arg->prec = arg->ch - '0';
14919 while (--ctx->fmtcnt >= 0) {
14920 arg->ch = FORMAT_READ(ctx);
14921 ctx->fmtpos++;
14922 if (arg->ch < '0' || arg->ch > '9')
14923 break;
14924 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14925 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014926 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014927 return -1;
14928 }
14929 arg->prec = arg->prec*10 + (arg->ch - '0');
14930 }
14931 }
14932 }
14933
14934 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14935 if (ctx->fmtcnt >= 0) {
14936 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14937 if (--ctx->fmtcnt >= 0) {
14938 arg->ch = FORMAT_READ(ctx);
14939 ctx->fmtpos++;
14940 }
14941 }
14942 }
14943 if (ctx->fmtcnt < 0) {
14944 PyErr_SetString(PyExc_ValueError,
14945 "incomplete format");
14946 return -1;
14947 }
14948 return 0;
14949
14950#undef FORMAT_READ
14951}
14952
14953/* Format one argument. Supported conversion specifiers:
14954
14955 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014956 - "i", "d", "u": int or float
14957 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014958 - "e", "E", "f", "F", "g", "G": float
14959 - "c": int or str (1 character)
14960
Victor Stinner8dbd4212012-12-04 09:30:24 +010014961 When possible, the output is written directly into the Unicode writer
14962 (ctx->writer). A string is created when padding is required.
14963
Victor Stinnera47082312012-10-04 02:19:54 +020014964 Return 0 if the argument has been formatted into *p_str,
14965 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014966 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014967static int
14968unicode_format_arg_format(struct unicode_formatter_t *ctx,
14969 struct unicode_format_arg_t *arg,
14970 PyObject **p_str)
14971{
14972 PyObject *v;
14973 _PyUnicodeWriter *writer = &ctx->writer;
14974
14975 if (ctx->fmtcnt == 0)
14976 ctx->writer.overallocate = 0;
14977
Victor Stinnera47082312012-10-04 02:19:54 +020014978 v = unicode_format_getnextarg(ctx);
14979 if (v == NULL)
14980 return -1;
14981
Victor Stinnera47082312012-10-04 02:19:54 +020014982
14983 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014984 case 's':
14985 case 'r':
14986 case 'a':
14987 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14988 /* Fast path */
14989 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14990 return -1;
14991 return 1;
14992 }
14993
14994 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14995 *p_str = v;
14996 Py_INCREF(*p_str);
14997 }
14998 else {
14999 if (arg->ch == 's')
15000 *p_str = PyObject_Str(v);
15001 else if (arg->ch == 'r')
15002 *p_str = PyObject_Repr(v);
15003 else
15004 *p_str = PyObject_ASCII(v);
15005 }
15006 break;
15007
15008 case 'i':
15009 case 'd':
15010 case 'u':
15011 case 'o':
15012 case 'x':
15013 case 'X':
15014 {
15015 int ret = mainformatlong(v, arg, p_str, writer);
15016 if (ret != 0)
15017 return ret;
15018 arg->sign = 1;
15019 break;
15020 }
15021
15022 case 'e':
15023 case 'E':
15024 case 'f':
15025 case 'F':
15026 case 'g':
15027 case 'G':
15028 if (arg->width == -1 && arg->prec == -1
15029 && !(arg->flags & (F_SIGN | F_BLANK)))
15030 {
15031 /* Fast path */
15032 if (formatfloat(v, arg, NULL, writer) == -1)
15033 return -1;
15034 return 1;
15035 }
15036
15037 arg->sign = 1;
15038 if (formatfloat(v, arg, p_str, NULL) == -1)
15039 return -1;
15040 break;
15041
15042 case 'c':
15043 {
15044 Py_UCS4 ch = formatchar(v);
15045 if (ch == (Py_UCS4) -1)
15046 return -1;
15047 if (arg->width == -1 && arg->prec == -1) {
15048 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015049 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015050 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015051 return 1;
15052 }
15053 *p_str = PyUnicode_FromOrdinal(ch);
15054 break;
15055 }
15056
15057 default:
15058 PyErr_Format(PyExc_ValueError,
15059 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015060 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015061 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15062 (int)arg->ch,
15063 ctx->fmtpos - 1);
15064 return -1;
15065 }
15066 if (*p_str == NULL)
15067 return -1;
15068 assert (PyUnicode_Check(*p_str));
15069 return 0;
15070}
15071
15072static int
15073unicode_format_arg_output(struct unicode_formatter_t *ctx,
15074 struct unicode_format_arg_t *arg,
15075 PyObject *str)
15076{
15077 Py_ssize_t len;
15078 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015079 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015080 Py_ssize_t pindex;
15081 Py_UCS4 signchar;
15082 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015083 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015084 Py_ssize_t sublen;
15085 _PyUnicodeWriter *writer = &ctx->writer;
15086 Py_UCS4 fill;
15087
15088 fill = ' ';
15089 if (arg->sign && arg->flags & F_ZERO)
15090 fill = '0';
15091
15092 if (PyUnicode_READY(str) == -1)
15093 return -1;
15094
15095 len = PyUnicode_GET_LENGTH(str);
15096 if ((arg->width == -1 || arg->width <= len)
15097 && (arg->prec == -1 || arg->prec >= len)
15098 && !(arg->flags & (F_SIGN | F_BLANK)))
15099 {
15100 /* Fast path */
15101 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15102 return -1;
15103 return 0;
15104 }
15105
15106 /* Truncate the string for "s", "r" and "a" formats
15107 if the precision is set */
15108 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15109 if (arg->prec >= 0 && len > arg->prec)
15110 len = arg->prec;
15111 }
15112
15113 /* Adjust sign and width */
15114 kind = PyUnicode_KIND(str);
15115 pbuf = PyUnicode_DATA(str);
15116 pindex = 0;
15117 signchar = '\0';
15118 if (arg->sign) {
15119 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15120 if (ch == '-' || ch == '+') {
15121 signchar = ch;
15122 len--;
15123 pindex++;
15124 }
15125 else if (arg->flags & F_SIGN)
15126 signchar = '+';
15127 else if (arg->flags & F_BLANK)
15128 signchar = ' ';
15129 else
15130 arg->sign = 0;
15131 }
15132 if (arg->width < len)
15133 arg->width = len;
15134
15135 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015136 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015137 if (!(arg->flags & F_LJUST)) {
15138 if (arg->sign) {
15139 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015140 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015141 }
15142 else {
15143 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015144 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015145 }
15146 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015147 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15148 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015149 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015150 }
15151
Victor Stinnera47082312012-10-04 02:19:54 +020015152 buflen = arg->width;
15153 if (arg->sign && len == arg->width)
15154 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015155 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015156 return -1;
15157
15158 /* Write the sign if needed */
15159 if (arg->sign) {
15160 if (fill != ' ') {
15161 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15162 writer->pos += 1;
15163 }
15164 if (arg->width > len)
15165 arg->width--;
15166 }
15167
15168 /* Write the numeric prefix for "x", "X" and "o" formats
15169 if the alternate form is used.
15170 For example, write "0x" for the "%#x" format. */
15171 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15172 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15173 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15174 if (fill != ' ') {
15175 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15176 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15177 writer->pos += 2;
15178 pindex += 2;
15179 }
15180 arg->width -= 2;
15181 if (arg->width < 0)
15182 arg->width = 0;
15183 len -= 2;
15184 }
15185
15186 /* Pad left with the fill character if needed */
15187 if (arg->width > len && !(arg->flags & F_LJUST)) {
15188 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015189 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015190 writer->pos += sublen;
15191 arg->width = len;
15192 }
15193
15194 /* If padding with spaces: write sign if needed and/or numeric prefix if
15195 the alternate form is used */
15196 if (fill == ' ') {
15197 if (arg->sign) {
15198 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15199 writer->pos += 1;
15200 }
15201 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15202 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15203 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15204 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15205 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15206 writer->pos += 2;
15207 pindex += 2;
15208 }
15209 }
15210
15211 /* Write characters */
15212 if (len) {
15213 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15214 str, pindex, len);
15215 writer->pos += len;
15216 }
15217
15218 /* Pad right with the fill character if needed */
15219 if (arg->width > len) {
15220 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015221 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015222 writer->pos += sublen;
15223 }
15224 return 0;
15225}
15226
15227/* Helper of PyUnicode_Format(): format one arg.
15228 Return 0 on success, raise an exception and return -1 on error. */
15229static int
15230unicode_format_arg(struct unicode_formatter_t *ctx)
15231{
15232 struct unicode_format_arg_t arg;
15233 PyObject *str;
15234 int ret;
15235
Victor Stinner8dbd4212012-12-04 09:30:24 +010015236 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015237 if (arg.ch == '%') {
15238 ctx->fmtpos++;
15239 ctx->fmtcnt--;
15240 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15241 return -1;
15242 return 0;
15243 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015244 arg.flags = 0;
15245 arg.width = -1;
15246 arg.prec = -1;
15247 arg.sign = 0;
15248 str = NULL;
15249
Victor Stinnera47082312012-10-04 02:19:54 +020015250 ret = unicode_format_arg_parse(ctx, &arg);
15251 if (ret == -1)
15252 return -1;
15253
15254 ret = unicode_format_arg_format(ctx, &arg, &str);
15255 if (ret == -1)
15256 return -1;
15257
15258 if (ret != 1) {
15259 ret = unicode_format_arg_output(ctx, &arg, str);
15260 Py_DECREF(str);
15261 if (ret == -1)
15262 return -1;
15263 }
15264
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015265 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015266 PyErr_SetString(PyExc_TypeError,
15267 "not all arguments converted during string formatting");
15268 return -1;
15269 }
15270 return 0;
15271}
15272
Alexander Belopolsky40018472011-02-26 01:02:56 +000015273PyObject *
15274PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015275{
Victor Stinnera47082312012-10-04 02:19:54 +020015276 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015277
Guido van Rossumd57fd912000-03-10 22:53:23 +000015278 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015279 PyErr_BadInternalCall();
15280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015281 }
Victor Stinnera47082312012-10-04 02:19:54 +020015282
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015283 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015284 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015285
15286 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015287 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15288 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15289 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15290 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015291
Victor Stinner8f674cc2013-04-17 23:02:17 +020015292 _PyUnicodeWriter_Init(&ctx.writer);
15293 ctx.writer.min_length = ctx.fmtcnt + 100;
15294 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015295
Guido van Rossumd57fd912000-03-10 22:53:23 +000015296 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015297 ctx.arglen = PyTuple_Size(args);
15298 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015299 }
15300 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015301 ctx.arglen = -1;
15302 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015303 }
Victor Stinnera47082312012-10-04 02:19:54 +020015304 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015305 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015306 ctx.dict = args;
15307 else
15308 ctx.dict = NULL;
15309 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015310
Victor Stinnera47082312012-10-04 02:19:54 +020015311 while (--ctx.fmtcnt >= 0) {
15312 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015313 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015314
15315 nonfmtpos = ctx.fmtpos++;
15316 while (ctx.fmtcnt >= 0 &&
15317 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15318 ctx.fmtpos++;
15319 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015320 }
Victor Stinnera47082312012-10-04 02:19:54 +020015321 if (ctx.fmtcnt < 0) {
15322 ctx.fmtpos--;
15323 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015324 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015325
Victor Stinnercfc4c132013-04-03 01:48:39 +020015326 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15327 nonfmtpos, ctx.fmtpos) < 0)
15328 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015329 }
15330 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015331 ctx.fmtpos++;
15332 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015333 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015334 }
15335 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015336
Victor Stinnera47082312012-10-04 02:19:54 +020015337 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015338 PyErr_SetString(PyExc_TypeError,
15339 "not all arguments converted during string formatting");
15340 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015341 }
15342
Victor Stinnera47082312012-10-04 02:19:54 +020015343 if (ctx.args_owned) {
15344 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015345 }
Victor Stinnera47082312012-10-04 02:19:54 +020015346 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015347
Benjamin Peterson29060642009-01-31 22:14:21 +000015348 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015349 _PyUnicodeWriter_Dealloc(&ctx.writer);
15350 if (ctx.args_owned) {
15351 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015352 }
15353 return NULL;
15354}
15355
Jeremy Hylton938ace62002-07-17 16:30:39 +000015356static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015357unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15358
Tim Peters6d6c1a32001-08-02 04:15:00 +000015359static PyObject *
15360unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15361{
Benjamin Peterson29060642009-01-31 22:14:21 +000015362 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015363 static char *kwlist[] = {"object", "encoding", "errors", 0};
15364 char *encoding = NULL;
15365 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015366
Benjamin Peterson14339b62009-01-31 16:36:08 +000015367 if (type != &PyUnicode_Type)
15368 return unicode_subtype_new(type, args, kwds);
15369 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015370 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015371 return NULL;
15372 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015373 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015374 if (encoding == NULL && errors == NULL)
15375 return PyObject_Str(x);
15376 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015377 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015378}
15379
Guido van Rossume023fe02001-08-30 03:12:59 +000015380static PyObject *
15381unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15382{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015383 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015384 Py_ssize_t length, char_size;
15385 int share_wstr, share_utf8;
15386 unsigned int kind;
15387 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015388
Benjamin Peterson14339b62009-01-31 16:36:08 +000015389 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015390
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015391 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015392 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015393 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015394 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015395 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015396 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015397 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015398 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015399
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015400 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015401 if (self == NULL) {
15402 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015403 return NULL;
15404 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015405 kind = PyUnicode_KIND(unicode);
15406 length = PyUnicode_GET_LENGTH(unicode);
15407
15408 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015409#ifdef Py_DEBUG
15410 _PyUnicode_HASH(self) = -1;
15411#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015412 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015413#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015414 _PyUnicode_STATE(self).interned = 0;
15415 _PyUnicode_STATE(self).kind = kind;
15416 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015417 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015418 _PyUnicode_STATE(self).ready = 1;
15419 _PyUnicode_WSTR(self) = NULL;
15420 _PyUnicode_UTF8_LENGTH(self) = 0;
15421 _PyUnicode_UTF8(self) = NULL;
15422 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015423 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015424
15425 share_utf8 = 0;
15426 share_wstr = 0;
15427 if (kind == PyUnicode_1BYTE_KIND) {
15428 char_size = 1;
15429 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15430 share_utf8 = 1;
15431 }
15432 else if (kind == PyUnicode_2BYTE_KIND) {
15433 char_size = 2;
15434 if (sizeof(wchar_t) == 2)
15435 share_wstr = 1;
15436 }
15437 else {
15438 assert(kind == PyUnicode_4BYTE_KIND);
15439 char_size = 4;
15440 if (sizeof(wchar_t) == 4)
15441 share_wstr = 1;
15442 }
15443
15444 /* Ensure we won't overflow the length. */
15445 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15446 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015447 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015448 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015449 data = PyObject_MALLOC((length + 1) * char_size);
15450 if (data == NULL) {
15451 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015452 goto onError;
15453 }
15454
Victor Stinnerc3c74152011-10-02 20:39:55 +020015455 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015456 if (share_utf8) {
15457 _PyUnicode_UTF8_LENGTH(self) = length;
15458 _PyUnicode_UTF8(self) = data;
15459 }
15460 if (share_wstr) {
15461 _PyUnicode_WSTR_LENGTH(self) = length;
15462 _PyUnicode_WSTR(self) = (wchar_t *)data;
15463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015464
Christian Heimesf051e432016-09-13 20:22:02 +020015465 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015466 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015467 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015468#ifdef Py_DEBUG
15469 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15470#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015471 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015472 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015473
15474onError:
15475 Py_DECREF(unicode);
15476 Py_DECREF(self);
15477 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015478}
15479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015480PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015481"str(object='') -> str\n\
15482str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015483\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015484Create a new string object from the given object. If encoding or\n\
15485errors is specified, then the object must expose a data buffer\n\
15486that will be decoded using the given encoding and error handler.\n\
15487Otherwise, returns the result of object.__str__() (if defined)\n\
15488or repr(object).\n\
15489encoding defaults to sys.getdefaultencoding().\n\
15490errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015491
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015492static PyObject *unicode_iter(PyObject *seq);
15493
Guido van Rossumd57fd912000-03-10 22:53:23 +000015494PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015495 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015496 "str", /* tp_name */
15497 sizeof(PyUnicodeObject), /* tp_basicsize */
15498 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015499 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015500 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015501 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015502 0, /* tp_getattr */
15503 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015504 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015505 unicode_repr, /* tp_repr */
15506 &unicode_as_number, /* tp_as_number */
15507 &unicode_as_sequence, /* tp_as_sequence */
15508 &unicode_as_mapping, /* tp_as_mapping */
15509 (hashfunc) unicode_hash, /* tp_hash*/
15510 0, /* tp_call*/
15511 (reprfunc) unicode_str, /* tp_str */
15512 PyObject_GenericGetAttr, /* tp_getattro */
15513 0, /* tp_setattro */
15514 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015515 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015516 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15517 unicode_doc, /* tp_doc */
15518 0, /* tp_traverse */
15519 0, /* tp_clear */
15520 PyUnicode_RichCompare, /* tp_richcompare */
15521 0, /* tp_weaklistoffset */
15522 unicode_iter, /* tp_iter */
15523 0, /* tp_iternext */
15524 unicode_methods, /* tp_methods */
15525 0, /* tp_members */
15526 0, /* tp_getset */
15527 &PyBaseObject_Type, /* tp_base */
15528 0, /* tp_dict */
15529 0, /* tp_descr_get */
15530 0, /* tp_descr_set */
15531 0, /* tp_dictoffset */
15532 0, /* tp_init */
15533 0, /* tp_alloc */
15534 unicode_new, /* tp_new */
15535 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015536};
15537
15538/* Initialize the Unicode implementation */
15539
Victor Stinner331a6a52019-05-27 16:39:22 +020015540PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015541_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015542{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015543 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015544 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015545 0x000A, /* LINE FEED */
15546 0x000D, /* CARRIAGE RETURN */
15547 0x001C, /* FILE SEPARATOR */
15548 0x001D, /* GROUP SEPARATOR */
15549 0x001E, /* RECORD SEPARATOR */
15550 0x0085, /* NEXT LINE */
15551 0x2028, /* LINE SEPARATOR */
15552 0x2029, /* PARAGRAPH SEPARATOR */
15553 };
15554
Fred Drakee4315f52000-05-09 19:53:39 +000015555 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015556 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015557 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015558 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015559 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015560 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015561
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015562 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015563 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015564 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015565
15566 /* initialize the linebreak bloom filter */
15567 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015568 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015569 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015570
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015571 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015572 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015573 }
15574 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015575 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015576 }
15577 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015578 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015579 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015580 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015581}
15582
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015583
Walter Dörwald16807132007-05-25 13:52:07 +000015584void
15585PyUnicode_InternInPlace(PyObject **p)
15586{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015587 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015588#ifdef Py_DEBUG
15589 assert(s != NULL);
15590 assert(_PyUnicode_CHECK(s));
15591#else
Victor Stinner607b1022020-05-05 18:50:30 +020015592 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015593 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015594 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015595#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015596
Benjamin Peterson14339b62009-01-31 16:36:08 +000015597 /* If it's a subclass, we don't really know what putting
15598 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015599 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015600 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015601 }
15602
15603 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015604 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015605 }
15606
15607#ifdef INTERNED_STRINGS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015608 if (interned == NULL) {
15609 interned = PyDict_New();
15610 if (interned == NULL) {
15611 PyErr_Clear(); /* Don't leave an exception */
15612 return;
15613 }
15614 }
Victor Stinner607b1022020-05-05 18:50:30 +020015615
15616 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015617 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015618 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015619 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015620
Berker Peksagced8d4c2016-07-25 04:40:39 +030015621 if (t == NULL) {
15622 PyErr_Clear();
15623 return;
15624 }
Victor Stinner607b1022020-05-05 18:50:30 +020015625
Berker Peksagced8d4c2016-07-25 04:40:39 +030015626 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015627 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015628 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015629 return;
15630 }
Victor Stinner607b1022020-05-05 18:50:30 +020015631
Benjamin Peterson14339b62009-01-31 16:36:08 +000015632 /* The two references in interned are not counted by refcnt.
15633 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015634 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015635 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015636#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015637}
15638
15639void
15640PyUnicode_InternImmortal(PyObject **p)
15641{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015642 PyUnicode_InternInPlace(p);
15643 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015644 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015645 Py_INCREF(*p);
15646 }
Walter Dörwald16807132007-05-25 13:52:07 +000015647}
15648
15649PyObject *
15650PyUnicode_InternFromString(const char *cp)
15651{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015652 PyObject *s = PyUnicode_FromString(cp);
15653 if (s == NULL)
15654 return NULL;
15655 PyUnicode_InternInPlace(&s);
15656 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015657}
15658
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015659
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo string interning so that a leak detector sees ordinary reference
   counts.

   Interned strings are not forcibly deallocated.  Each key of the interned
   dict gets the references back that interning removed from its refcount
   (one for an immortal string, two for a mortal one) and is flagged
   SSTATE_NOT_INTERNED; afterwards the dict is cleared and released. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }

    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    const Py_ssize_t count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %zd interned strings\n", count);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t idx = 0; idx < count; idx++) {
        PyObject *str = PyList_GET_ITEM(keys, idx);
        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }

        const unsigned int state = PyUnicode_CHECK_INTERNED(str);
        if (state == SSTATE_INTERNED_IMMORTAL) {
            Py_SET_REFCNT(str, Py_REFCNT(str) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else if (state == SSTATE_INTERNED_MORTAL) {
            Py_SET_REFCNT(str, Py_REFCNT(str) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else {
            /* SSTATE_NOT_INTERNED or any other value: a key of the interned
               dict is expected to carry one of the two states above. */
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr,
            "total size of all interned strings: %zd/%zd mortal/immortal\n",
            mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015719
15720
15721/********************* Unicode Iterator **************************/
15722
15723typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015724 PyObject_HEAD
15725 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015726 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015727} unicodeiterobject;
15728
15729static void
15730unicodeiter_dealloc(unicodeiterobject *it)
15731{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015732 _PyObject_GC_UNTRACK(it);
15733 Py_XDECREF(it->it_seq);
15734 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015735}
15736
15737static int
15738unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15739{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015740 Py_VISIT(it->it_seq);
15741 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015742}
15743
15744static PyObject *
15745unicodeiter_next(unicodeiterobject *it)
15746{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015747 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015748
Benjamin Peterson14339b62009-01-31 16:36:08 +000015749 assert(it != NULL);
15750 seq = it->it_seq;
15751 if (seq == NULL)
15752 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015753 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015755 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15756 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015757 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015758 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15759 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015760 if (item != NULL)
15761 ++it->it_index;
15762 return item;
15763 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015764
Benjamin Peterson14339b62009-01-31 16:36:08 +000015765 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015766 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015767 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015768}
15769
15770static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015771unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015772{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015773 Py_ssize_t len = 0;
15774 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015775 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015776 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015777}
15778
15779PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15780
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015781static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015782unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015783{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015784 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015785 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015786 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015787 it->it_seq, it->it_index);
15788 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015789 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015790 if (u == NULL)
15791 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015792 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015793 }
15794}
15795
15796PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15797
15798static PyObject *
15799unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15800{
15801 Py_ssize_t index = PyLong_AsSsize_t(state);
15802 if (index == -1 && PyErr_Occurred())
15803 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015804 if (it->it_seq != NULL) {
15805 if (index < 0)
15806 index = 0;
15807 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15808 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15809 it->it_index = index;
15810 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015811 Py_RETURN_NONE;
15812}
15813
15814PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15815
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015816static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015817 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015818 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015819 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15820 reduce_doc},
15821 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15822 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015823 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015824};
15825
15826PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015827 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15828 "str_iterator", /* tp_name */
15829 sizeof(unicodeiterobject), /* tp_basicsize */
15830 0, /* tp_itemsize */
15831 /* methods */
15832 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015833 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015834 0, /* tp_getattr */
15835 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015836 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015837 0, /* tp_repr */
15838 0, /* tp_as_number */
15839 0, /* tp_as_sequence */
15840 0, /* tp_as_mapping */
15841 0, /* tp_hash */
15842 0, /* tp_call */
15843 0, /* tp_str */
15844 PyObject_GenericGetAttr, /* tp_getattro */
15845 0, /* tp_setattro */
15846 0, /* tp_as_buffer */
15847 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15848 0, /* tp_doc */
15849 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15850 0, /* tp_clear */
15851 0, /* tp_richcompare */
15852 0, /* tp_weaklistoffset */
15853 PyObject_SelfIter, /* tp_iter */
15854 (iternextfunc)unicodeiter_next, /* tp_iternext */
15855 unicodeiter_methods, /* tp_methods */
15856 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015857};
15858
15859static PyObject *
15860unicode_iter(PyObject *seq)
15861{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015862 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015863
Benjamin Peterson14339b62009-01-31 16:36:08 +000015864 if (!PyUnicode_Check(seq)) {
15865 PyErr_BadInternalCall();
15866 return NULL;
15867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015868 if (PyUnicode_READY(seq) == -1)
15869 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015870 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15871 if (it == NULL)
15872 return NULL;
15873 it->it_index = 0;
15874 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015875 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015876 _PyObject_GC_TRACK(it);
15877 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015878}
15879
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015880
15881size_t
15882Py_UNICODE_strlen(const Py_UNICODE *u)
15883{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015884 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015885}
15886
15887Py_UNICODE*
15888Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15889{
15890 Py_UNICODE *u = s1;
15891 while ((*u++ = *s2++));
15892 return s1;
15893}
15894
15895Py_UNICODE*
15896Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15897{
15898 Py_UNICODE *u = s1;
15899 while ((*u++ = *s2++))
15900 if (n-- == 0)
15901 break;
15902 return s1;
15903}
15904
15905Py_UNICODE*
15906Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15907{
15908 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015909 u1 += wcslen(u1);
15910 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015911 return s1;
15912}
15913
15914int
15915Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15916{
15917 while (*s1 && *s2 && *s1 == *s2)
15918 s1++, s2++;
15919 if (*s1 && *s2)
15920 return (*s1 < *s2) ? -1 : +1;
15921 if (*s1)
15922 return 1;
15923 if (*s2)
15924 return -1;
15925 return 0;
15926}
15927
15928int
15929Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15930{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015931 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015932 for (; n != 0; n--) {
15933 u1 = *s1;
15934 u2 = *s2;
15935 if (u1 != u2)
15936 return (u1 < u2) ? -1 : +1;
15937 if (u1 == '\0')
15938 return 0;
15939 s1++;
15940 s2++;
15941 }
15942 return 0;
15943}
15944
15945Py_UNICODE*
15946Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15947{
15948 const Py_UNICODE *p;
15949 for (p = s; *p; p++)
15950 if (*p == c)
15951 return (Py_UNICODE*)p;
15952 return NULL;
15953}
15954
15955Py_UNICODE*
15956Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15957{
15958 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015959 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015960 while (p != s) {
15961 p--;
15962 if (*p == c)
15963 return (Py_UNICODE*)p;
15964 }
15965 return NULL;
15966}
Victor Stinner331ea922010-08-10 16:37:20 +000015967
Victor Stinner71133ff2010-09-01 23:43:53 +000015968Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015969PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015970{
Victor Stinner577db2c2011-10-11 22:12:48 +020015971 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015972 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015974 if (!PyUnicode_Check(unicode)) {
15975 PyErr_BadArgument();
15976 return NULL;
15977 }
Inada Naoki2c4928d2020-06-17 20:09:44 +090015978_Py_COMP_DIAG_PUSH
15979_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015980 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Inada Naoki2c4928d2020-06-17 20:09:44 +090015981_Py_COMP_DIAG_POP
Victor Stinner577db2c2011-10-11 22:12:48 +020015982 if (u == NULL)
15983 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015984 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015985 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015986 PyErr_NoMemory();
15987 return NULL;
15988 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015989 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015990 size *= sizeof(Py_UNICODE);
15991 copy = PyMem_Malloc(size);
15992 if (copy == NULL) {
15993 PyErr_NoMemory();
15994 return NULL;
15995 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015996 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015997 return copy;
15998}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015999
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016000
Victor Stinner709d23d2019-05-02 14:56:30 -040016001static int
16002encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016003{
Victor Stinner709d23d2019-05-02 14:56:30 -040016004 int res;
16005 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16006 if (res == -2) {
16007 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16008 return -1;
16009 }
16010 if (res < 0) {
16011 PyErr_NoMemory();
16012 return -1;
16013 }
16014 return 0;
16015}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016016
Victor Stinner709d23d2019-05-02 14:56:30 -040016017
16018static int
16019config_get_codec_name(wchar_t **config_encoding)
16020{
16021 char *encoding;
16022 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16023 return -1;
16024 }
16025
16026 PyObject *name_obj = NULL;
16027 PyObject *codec = _PyCodec_Lookup(encoding);
16028 PyMem_RawFree(encoding);
16029
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016030 if (!codec)
16031 goto error;
16032
16033 name_obj = PyObject_GetAttrString(codec, "name");
16034 Py_CLEAR(codec);
16035 if (!name_obj) {
16036 goto error;
16037 }
16038
Victor Stinner709d23d2019-05-02 14:56:30 -040016039 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16040 Py_DECREF(name_obj);
16041 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016042 goto error;
16043 }
16044
Victor Stinner709d23d2019-05-02 14:56:30 -040016045 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16046 if (raw_wname == NULL) {
16047 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016048 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016049 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016050 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016051
16052 PyMem_RawFree(*config_encoding);
16053 *config_encoding = raw_wname;
16054
16055 PyMem_Free(wname);
16056 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016057
16058error:
16059 Py_XDECREF(codec);
16060 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016061 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016062}
16063
16064
Victor Stinner331a6a52019-05-27 16:39:22 +020016065static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016066init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016067{
Victor Stinner709d23d2019-05-02 14:56:30 -040016068 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016069 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016070 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016071 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016072 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016073 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016074 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016075}
16076
16077
Victor Stinner709d23d2019-05-02 14:56:30 -040016078static int
16079init_fs_codec(PyInterpreterState *interp)
16080{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016081 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016082
16083 _Py_error_handler error_handler;
16084 error_handler = get_error_handler_wide(config->filesystem_errors);
16085 if (error_handler == _Py_ERROR_UNKNOWN) {
16086 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16087 return -1;
16088 }
16089
16090 char *encoding, *errors;
16091 if (encode_wstr_utf8(config->filesystem_encoding,
16092 &encoding,
16093 "filesystem_encoding") < 0) {
16094 return -1;
16095 }
16096
16097 if (encode_wstr_utf8(config->filesystem_errors,
16098 &errors,
16099 "filesystem_errors") < 0) {
16100 PyMem_RawFree(encoding);
16101 return -1;
16102 }
16103
Victor Stinner3d17c042020-05-14 01:48:38 +020016104 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16105 PyMem_RawFree(fs_codec->encoding);
16106 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016107 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016108 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16109 PyMem_RawFree(fs_codec->errors);
16110 fs_codec->errors = errors;
16111 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016112
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016113#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016114 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016115#endif
16116
Victor Stinner709d23d2019-05-02 14:56:30 -040016117 /* At this point, PyUnicode_EncodeFSDefault() and
16118 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16119 the C implementation of the filesystem encoding. */
16120
16121 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16122 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016123 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16124 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016125 PyErr_NoMemory();
16126 return -1;
16127 }
16128 return 0;
16129}
16130
16131
Victor Stinner331a6a52019-05-27 16:39:22 +020016132static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016133init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016134{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016135 PyInterpreterState *interp = tstate->interp;
16136
Victor Stinner709d23d2019-05-02 14:56:30 -040016137 /* Update the filesystem encoding to the normalized Python codec name.
16138 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16139 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016140 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016141 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016142 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016143 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016144 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016145 }
16146
Victor Stinner709d23d2019-05-02 14:56:30 -040016147 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016148 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016149 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016150 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016151}
16152
16153
Victor Stinner331a6a52019-05-27 16:39:22 +020016154PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016155_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016156{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016157 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016158 if (_PyStatus_EXCEPTION(status)) {
16159 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016160 }
16161
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016162 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016163}
16164
16165
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016166static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016167_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016168{
Victor Stinner3d17c042020-05-14 01:48:38 +020016169 PyMem_RawFree(fs_codec->encoding);
16170 fs_codec->encoding = NULL;
16171 fs_codec->utf8 = 0;
16172 PyMem_RawFree(fs_codec->errors);
16173 fs_codec->errors = NULL;
16174 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016175}
16176
16177
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to mbcs/replace
   (PEP 529) and re-initialize the filesystem codec.

   Returns 0 on success, -1 with an exception set on failure.  The
   configuration is left unchanged when the new strings cannot be
   allocated. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    /* Both allocations succeeded: swap them into the configuration. */
    PyMem_RawFree(config->filesystem_encoding);
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_encoding = new_encoding;
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16203
16204
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016205void
Victor Stinner3d483342019-11-22 12:27:50 +010016206_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016207{
Victor Stinner3d483342019-11-22 12:27:50 +010016208 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016209#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016210 /* Insure++ is a memory analysis tool that aids in discovering
16211 * memory leaks and other memory problems. On Python exit, the
16212 * interned string dictionaries are flagged as being in use at exit
16213 * (which it is). Under normal circumstances, this is fine because
16214 * the memory will be automatically reclaimed by the system. Under
16215 * memory debugging, it's a huge source of useless noise, so we
16216 * trade off slower shutdown for less distraction in the memory
16217 * reports. -baw
16218 */
16219 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016220#endif /* __INSURE__ */
16221
Victor Stinner3d483342019-11-22 12:27:50 +010016222 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016223
Victor Stinner607b1022020-05-05 18:50:30 +020016224#ifdef LATIN1_SINGLETONS
Victor Stinner3d483342019-11-22 12:27:50 +010016225 for (Py_ssize_t i = 0; i < 256; i++) {
16226 Py_CLEAR(unicode_latin1[i]);
16227 }
Victor Stinner607b1022020-05-05 18:50:30 +020016228#endif
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016229 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016230 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016231
Victor Stinner3d17c042020-05-14 01:48:38 +020016232 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016233}
16234
16235
Georg Brandl66c221e2010-10-14 07:04:07 +000016236/* A _string module, to export formatter_parser and formatter_field_name_split
16237 to the string.Formatter class implemented in Python. */
16238
16239static PyMethodDef _string_methods[] = {
16240 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16241 METH_O, PyDoc_STR("split the argument as a field name")},
16242 {"formatter_parser", (PyCFunction) formatter_parser,
16243 METH_O, PyDoc_STR("parse the argument as a format string")},
16244 {NULL, NULL}
16245};
16246
16247static struct PyModuleDef _string_module = {
16248 PyModuleDef_HEAD_INIT,
16249 "_string",
16250 PyDoc_STR("string helper module"),
16251 0,
16252 _string_methods,
16253 NULL,
16254 NULL,
16255 NULL,
16256 NULL
16257};
16258
16259PyMODINIT_FUNC
16260PyInit__string(void)
16261{
16262 return PyModule_Create(&_string_module);
16263}
16264
16265
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016266#ifdef __cplusplus
16267}
16268#endif