blob: e69bf01251ceddfe2deff9f37b57c690677613b1 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner45876a92020-02-12 22:32:34 +010044#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010045#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_initconfig.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020047#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinnerbcda8f12018-11-21 22:27:47 +010048#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020049#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040050#include "pycore_pylifecycle.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020051#include "pycore_pystate.h" // _PyInterpreterState_GET()
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000052#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070053#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000055#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000056#include <windows.h>
57#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000058
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
61/* #define INTERNED_STATS 1 */
62
63
Larry Hastings61272b72014-01-07 12:41:53 -080064/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090065class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080066[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090067/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
68
69/*[python input]
70class Py_UCS4_converter(CConverter):
71 type = 'Py_UCS4'
72 converter = 'convert_uc'
73
74 def converter_init(self):
75 if self.default is not unspecified:
76 self.c_default = ascii(self.default)
77 if len(self.c_default) > 4 or self.c_default[0] != "'":
78 self.c_default = hex(ord(self.default))
79
80[python start generated code]*/
81/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080082
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchaka05997252013-01-26 12:14:02 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Victor Stinner8faf8212011-12-08 22:14:11 +010096/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
97#define MAX_UNICODE 0x10ffff
98
Victor Stinner910337b2011-10-03 03:20:16 +020099#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200100# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +0200101#else
102# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
103#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200104
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200105#define _PyUnicode_UTF8(op) \
106 (((PyCompactUnicodeObject*)(op))->utf8)
107#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200108 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200109 assert(PyUnicode_IS_READY(op)), \
110 PyUnicode_IS_COMPACT_ASCII(op) ? \
111 ((char*)((PyASCIIObject*)(op) + 1)) : \
112 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200113#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200114 (((PyCompactUnicodeObject*)(op))->utf8_length)
115#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200116 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200117 assert(PyUnicode_IS_READY(op)), \
118 PyUnicode_IS_COMPACT_ASCII(op) ? \
119 ((PyASCIIObject*)(op))->length : \
120 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +0200121#define _PyUnicode_WSTR(op) \
122 (((PyASCIIObject*)(op))->wstr)
123#define _PyUnicode_WSTR_LENGTH(op) \
124 (((PyCompactUnicodeObject*)(op))->wstr_length)
125#define _PyUnicode_LENGTH(op) \
126 (((PyASCIIObject *)(op))->length)
127#define _PyUnicode_STATE(op) \
128 (((PyASCIIObject *)(op))->state)
129#define _PyUnicode_HASH(op) \
130 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200131#define _PyUnicode_KIND(op) \
132 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200133 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200134#define _PyUnicode_GET_LENGTH(op) \
135 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200137#define _PyUnicode_DATA_ANY(op) \
138 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200139
Victor Stinner910337b2011-10-03 03:20:16 +0200140#undef PyUnicode_READY
141#define PyUnicode_READY(op) \
142 (assert(_PyUnicode_CHECK(op)), \
143 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200144 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100145 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200146
Victor Stinnerc379ead2011-10-03 12:52:27 +0200147#define _PyUnicode_SHARE_UTF8(op) \
148 (assert(_PyUnicode_CHECK(op)), \
149 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
150 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the Unicode object `op` is equal to its
   PyUnicode_DATA() pointer.  Only the macro argument is referenced, so the
   result does not depend on what the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
154
Victor Stinner829c0ad2011-10-03 01:08:02 +0200155/* true if the Unicode object has an allocated UTF-8 memory block
156 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200157#define _PyUnicode_HAS_UTF8_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200158 ((!PyUnicode_IS_COMPACT_ASCII(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200159 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200160 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
161
Victor Stinner03490912011-10-03 23:45:12 +0200162/* true if the Unicode object has an allocated wstr memory block
163 (not shared with other data) */
164#define _PyUnicode_HAS_WSTR_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200165 ((_PyUnicode_WSTR(op) && \
Victor Stinner03490912011-10-03 23:45:12 +0200166 (!PyUnicode_IS_READY(op) || \
167 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
168
Victor Stinner910337b2011-10-03 03:20:16 +0200169/* Generic helper macro to convert characters of different types.
170 from_type and to_type have to be valid type names, begin and end
171 are pointers to the source characters which should be of type
172 "from_type *". to is a pointer of type "to_type *" and points to the
173 buffer where the result characters are written to. */
174#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
175 do { \
Victor Stinner4a587072013-11-19 12:54:53 +0100176 to_type *_to = (to_type *)(to); \
Andy Lestere6be9b52020-02-11 20:28:35 -0600177 const from_type *_iter = (const from_type *)(begin);\
178 const from_type *_end = (const from_type *)(end);\
Antoine Pitroue459a082011-10-11 20:58:41 +0200179 Py_ssize_t n = (_end) - (_iter); \
180 const from_type *_unrolled_end = \
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200181 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \
Antoine Pitroue459a082011-10-11 20:58:41 +0200182 while (_iter < (_unrolled_end)) { \
183 _to[0] = (to_type) _iter[0]; \
184 _to[1] = (to_type) _iter[1]; \
185 _to[2] = (to_type) _iter[2]; \
186 _to[3] = (to_type) _iter[3]; \
187 _iter += 4; _to += 4; \
Victor Stinner910337b2011-10-03 03:20:16 +0200188 } \
Antoine Pitroue459a082011-10-11 20:58:41 +0200189 while (_iter < (_end)) \
190 *_to++ = (to_type) *_iter++; \
Victor Stinner910337b2011-10-03 03:20:16 +0200191 } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200192
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200193#ifdef MS_WINDOWS
194 /* On Windows, overallocate by 50% is the best factor */
195# define OVERALLOCATE_FACTOR 2
196#else
197 /* On Linux, overallocate by 25% is the best factor */
198# define OVERALLOCATE_FACTOR 4
199#endif
200
Victor Stinner607b1022020-05-05 18:50:30 +0200201/* bpo-40521: Interned strings are shared by all interpreters. */
202#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
203# define INTERNED_STRINGS
204#endif
205
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Victor Stinner607b1022020-05-05 18:50:30 +0200214#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200215static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200216#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000217
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000218/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200219static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200220
Serhiy Storchaka678db842013-01-26 12:16:36 +0200221#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200222 do { \
223 if (unicode_empty != NULL) \
224 Py_INCREF(unicode_empty); \
225 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200226 unicode_empty = PyUnicode_New(0, 0); \
227 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200228 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200229 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
230 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200231 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200232 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000233
Serhiy Storchaka678db842013-01-26 12:16:36 +0200234#define _Py_RETURN_UNICODE_EMPTY() \
235 do { \
236 _Py_INCREF_UNICODE_EMPTY(); \
237 return unicode_empty; \
238 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000239
Victor Stinner59423e32018-11-26 13:40:01 +0100240static inline void
241unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
242 Py_ssize_t start, Py_ssize_t length)
243{
244 assert(0 <= start);
245 assert(kind != PyUnicode_WCHAR_KIND);
246 switch (kind) {
247 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100248 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100249 Py_UCS1 ch = (unsigned char)value;
250 Py_UCS1 *to = (Py_UCS1 *)data + start;
251 memset(to, ch, length);
252 break;
253 }
254 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100255 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100256 Py_UCS2 ch = (Py_UCS2)value;
257 Py_UCS2 *to = (Py_UCS2 *)data + start;
258 const Py_UCS2 *end = to + length;
259 for (; to < end; ++to) *to = ch;
260 break;
261 }
262 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100263 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100264 Py_UCS4 ch = value;
265 Py_UCS4 * to = (Py_UCS4 *)data + start;
266 const Py_UCS4 *end = to + length;
267 for (; to < end; ++to) *to = ch;
268 break;
269 }
270 default: Py_UNREACHABLE();
271 }
272}
273
274
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200275/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700276static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200277_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900278static inline void
279_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400280static PyObject *
281unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
282 const char *errors);
283static PyObject *
284unicode_decode_utf8(const char *s, Py_ssize_t size,
285 _Py_error_handler error_handler, const char *errors,
286 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200287
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200288/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200289static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200290
Victor Stinner607b1022020-05-05 18:50:30 +0200291/* bpo-40521: Latin1 singletons are shared by all interpreters. */
292#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
293# define LATIN1_SINGLETONS
294#endif
295
296#ifdef LATIN1_SINGLETONS
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297/* Single character Unicode strings in the Latin-1 range are being
298 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200299static PyObject *unicode_latin1[256] = {NULL};
Victor Stinner607b1022020-05-05 18:50:30 +0200300#endif
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000301
Christian Heimes190d79e2008-01-30 11:58:22 +0000302/* Fast detection of the most frequent whitespace characters */
303const unsigned char _Py_ascii_whitespace[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000304 0, 0, 0, 0, 0, 0, 0, 0,
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000305/* case 0x0009: * CHARACTER TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000306/* case 0x000A: * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000307/* case 0x000B: * LINE TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000308/* case 0x000C: * FORM FEED */
309/* case 0x000D: * CARRIAGE RETURN */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000310 0, 1, 1, 1, 1, 1, 0, 0,
311 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000312/* case 0x001C: * FILE SEPARATOR */
313/* case 0x001D: * GROUP SEPARATOR */
314/* case 0x001E: * RECORD SEPARATOR */
315/* case 0x001F: * UNIT SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000316 0, 0, 0, 0, 1, 1, 1, 1,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000317/* case 0x0020: * SPACE */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000318 1, 0, 0, 0, 0, 0, 0, 0,
319 0, 0, 0, 0, 0, 0, 0, 0,
320 0, 0, 0, 0, 0, 0, 0, 0,
321 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000322
Benjamin Peterson14339b62009-01-31 16:36:08 +0000323 0, 0, 0, 0, 0, 0, 0, 0,
324 0, 0, 0, 0, 0, 0, 0, 0,
325 0, 0, 0, 0, 0, 0, 0, 0,
326 0, 0, 0, 0, 0, 0, 0, 0,
327 0, 0, 0, 0, 0, 0, 0, 0,
328 0, 0, 0, 0, 0, 0, 0, 0,
329 0, 0, 0, 0, 0, 0, 0, 0,
330 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000331};
332
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200333/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200334static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200335static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100336static int unicode_modifiable(PyObject *unicode);
337
Victor Stinnerfe226c02011-10-03 03:52:20 +0200338
Alexander Belopolsky40018472011-02-26 01:02:56 +0000339static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100340_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200341static PyObject *
342_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
343static PyObject *
344_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
345
346static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000347unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000348 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100349 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000350 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
351
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352static void
353raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300354 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100355 PyObject *unicode,
356 Py_ssize_t startpos, Py_ssize_t endpos,
357 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000358
Christian Heimes190d79e2008-01-30 11:58:22 +0000359/* Same for linebreaks */
Serhiy Storchaka2d06e842015-12-25 19:53:18 +0200360static const unsigned char ascii_linebreak[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000361 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000362/* 0x000A, * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000363/* 0x000B, * LINE TABULATION */
364/* 0x000C, * FORM FEED */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000365/* 0x000D, * CARRIAGE RETURN */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000366 0, 0, 1, 1, 1, 1, 0, 0,
Benjamin Peterson14339b62009-01-31 16:36:08 +0000367 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000368/* 0x001C, * FILE SEPARATOR */
369/* 0x001D, * GROUP SEPARATOR */
370/* 0x001E, * RECORD SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000371 0, 0, 0, 0, 1, 1, 1, 0,
372 0, 0, 0, 0, 0, 0, 0, 0,
373 0, 0, 0, 0, 0, 0, 0, 0,
374 0, 0, 0, 0, 0, 0, 0, 0,
375 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000376
Benjamin Peterson14339b62009-01-31 16:36:08 +0000377 0, 0, 0, 0, 0, 0, 0, 0,
378 0, 0, 0, 0, 0, 0, 0, 0,
379 0, 0, 0, 0, 0, 0, 0, 0,
380 0, 0, 0, 0, 0, 0, 0, 0,
381 0, 0, 0, 0, 0, 0, 0, 0,
382 0, 0, 0, 0, 0, 0, 0, 0,
383 0, 0, 0, 0, 0, 0, 0, 0,
384 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000385};
386
INADA Naoki3ae20562017-01-16 20:41:20 +0900387static int convert_uc(PyObject *obj, void *addr);
388
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300389#include "clinic/unicodeobject.c.h"
390
Victor Stinner3d4226a2018-08-29 22:21:32 +0200391_Py_error_handler
392_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200393{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200394 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200395 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200396 }
397 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 }
400 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200401 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200402 }
403 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200404 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200405 }
406 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200407 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200408 }
409 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200410 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200411 }
412 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200413 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200414 }
Victor Stinner50149202015-09-22 00:26:54 +0200415 return _Py_ERROR_OTHER;
416}
417
Victor Stinner709d23d2019-05-02 14:56:30 -0400418
419static _Py_error_handler
420get_error_handler_wide(const wchar_t *errors)
421{
422 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
423 return _Py_ERROR_STRICT;
424 }
425 if (wcscmp(errors, L"surrogateescape") == 0) {
426 return _Py_ERROR_SURROGATEESCAPE;
427 }
428 if (wcscmp(errors, L"replace") == 0) {
429 return _Py_ERROR_REPLACE;
430 }
431 if (wcscmp(errors, L"ignore") == 0) {
432 return _Py_ERROR_IGNORE;
433 }
434 if (wcscmp(errors, L"backslashreplace") == 0) {
435 return _Py_ERROR_BACKSLASHREPLACE;
436 }
437 if (wcscmp(errors, L"surrogatepass") == 0) {
438 return _Py_ERROR_SURROGATEPASS;
439 }
440 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
441 return _Py_ERROR_XMLCHARREFREPLACE;
442 }
443 return _Py_ERROR_OTHER;
444}
445
446
Victor Stinner22eb6892019-06-26 00:51:05 +0200447static inline int
448unicode_check_encoding_errors(const char *encoding, const char *errors)
449{
450 if (encoding == NULL && errors == NULL) {
451 return 0;
452 }
453
Victor Stinner81a7be32020-04-14 15:14:01 +0200454 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200455#ifndef Py_DEBUG
456 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200457 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200458 return 0;
459 }
460#else
461 /* Always check in debug mode */
462#endif
463
464 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
465 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200466 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200467 return 0;
468 }
469
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200470 /* Disable checks during Python finalization. For example, it allows to
471 call _PyObject_Dump() during finalization for debugging purpose. */
472 if (interp->finalizing) {
473 return 0;
474 }
475
Victor Stinner22eb6892019-06-26 00:51:05 +0200476 if (encoding != NULL) {
477 PyObject *handler = _PyCodec_Lookup(encoding);
478 if (handler == NULL) {
479 return -1;
480 }
481 Py_DECREF(handler);
482 }
483
484 if (errors != NULL) {
485 PyObject *handler = PyCodec_LookupError(errors);
486 if (handler == NULL) {
487 return -1;
488 }
489 Py_DECREF(handler);
490 }
491 return 0;
492}
493
494
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300495/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
496 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000497Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000498PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000499{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000500#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000501 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000502#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000503 /* This is actually an illegal character, so it should
504 not be passed to unichr. */
505 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000506#endif
507}
508
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200509int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100510_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200511{
Victor Stinner68762572019-10-07 18:42:01 +0200512#define CHECK(expr) \
513 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
514
Victor Stinner910337b2011-10-03 03:20:16 +0200515 PyASCIIObject *ascii;
516 unsigned int kind;
517
Victor Stinner68762572019-10-07 18:42:01 +0200518 assert(op != NULL);
519 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200520
521 ascii = (PyASCIIObject *)op;
522 kind = ascii->state.kind;
523
Victor Stinnera3b334d2011-10-03 13:53:37 +0200524 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(kind == PyUnicode_1BYTE_KIND);
526 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200527 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200528 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200529 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200530 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200531
Victor Stinnera41463c2011-10-04 01:05:08 +0200532 if (ascii->state.compact == 1) {
533 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200534 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200535 || kind == PyUnicode_2BYTE_KIND
536 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200537 CHECK(ascii->state.ascii == 0);
538 CHECK(ascii->state.ready == 1);
539 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100540 }
541 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200542 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
543
544 data = unicode->data.any;
545 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200546 CHECK(ascii->length == 0);
547 CHECK(ascii->hash == -1);
548 CHECK(ascii->state.compact == 0);
549 CHECK(ascii->state.ascii == 0);
550 CHECK(ascii->state.ready == 0);
551 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
552 CHECK(ascii->wstr != NULL);
553 CHECK(data == NULL);
554 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200555 }
556 else {
Victor Stinner68762572019-10-07 18:42:01 +0200557 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200558 || kind == PyUnicode_2BYTE_KIND
559 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200560 CHECK(ascii->state.compact == 0);
561 CHECK(ascii->state.ready == 1);
562 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200563 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200564 CHECK(compact->utf8 == data);
565 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200566 }
567 else
Victor Stinner68762572019-10-07 18:42:01 +0200568 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200569 }
570 }
571 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200572 if (
573#if SIZEOF_WCHAR_T == 2
574 kind == PyUnicode_2BYTE_KIND
575#else
576 kind == PyUnicode_4BYTE_KIND
577#endif
578 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200579 {
Victor Stinner68762572019-10-07 18:42:01 +0200580 CHECK(ascii->wstr == data);
581 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200582 } else
Victor Stinner68762572019-10-07 18:42:01 +0200583 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200584 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200585
586 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200587 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200588 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200589 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200590 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200591
592 /* check that the best kind is used: O(n) operation */
593 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200594 Py_ssize_t i;
595 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300596 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200597 Py_UCS4 ch;
598
599 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200600 for (i=0; i < ascii->length; i++)
601 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200602 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200603 if (ch > maxchar)
604 maxchar = ch;
605 }
606 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100607 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200608 CHECK(maxchar >= 128);
609 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100610 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200611 else
Victor Stinner68762572019-10-07 18:42:01 +0200612 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200613 }
Victor Stinner77faf692011-11-20 18:56:05 +0100614 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200615 CHECK(maxchar >= 0x100);
616 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100617 }
618 else {
Victor Stinner68762572019-10-07 18:42:01 +0200619 CHECK(maxchar >= 0x10000);
620 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100621 }
Victor Stinner68762572019-10-07 18:42:01 +0200622 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200623 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400624 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200625
626#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400627}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200628
Victor Stinner910337b2011-10-03 03:20:16 +0200629
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100630static PyObject*
631unicode_result_wchar(PyObject *unicode)
632{
633#ifndef Py_DEBUG
634 Py_ssize_t len;
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 len = _PyUnicode_WSTR_LENGTH(unicode);
637 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200639 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100640 }
641
642 if (len == 1) {
643 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100644 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100645 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
646 Py_DECREF(unicode);
647 return latin1_char;
648 }
649 }
650
651 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200652 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100653 return NULL;
654 }
655#else
Victor Stinneraa771272012-10-04 02:32:58 +0200656 assert(Py_REFCNT(unicode) == 1);
657
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100658 /* don't make the result ready in debug mode to ensure that the caller
659 makes the string ready before using it */
660 assert(_PyUnicode_CheckConsistency(unicode, 1));
661#endif
662 return unicode;
663}
664
665static PyObject*
666unicode_result_ready(PyObject *unicode)
667{
668 Py_ssize_t length;
669
670 length = PyUnicode_GET_LENGTH(unicode);
671 if (length == 0) {
672 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100673 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200674 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100675 }
676 return unicode_empty;
677 }
678
Victor Stinner607b1022020-05-05 18:50:30 +0200679#ifdef LATIN1_SINGLETONS
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100680 if (length == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300681 const void *data = PyUnicode_DATA(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +0200682 int kind = PyUnicode_KIND(unicode);
683 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100684 if (ch < 256) {
685 PyObject *latin1_char = unicode_latin1[ch];
686 if (latin1_char != NULL) {
687 if (unicode != latin1_char) {
688 Py_INCREF(latin1_char);
689 Py_DECREF(unicode);
690 }
691 return latin1_char;
692 }
693 else {
694 assert(_PyUnicode_CheckConsistency(unicode, 1));
695 Py_INCREF(unicode);
696 unicode_latin1[ch] = unicode;
697 return unicode;
698 }
699 }
700 }
Victor Stinner607b1022020-05-05 18:50:30 +0200701#endif
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100702
703 assert(_PyUnicode_CheckConsistency(unicode, 1));
704 return unicode;
705}
706
707static PyObject*
708unicode_result(PyObject *unicode)
709{
710 assert(_PyUnicode_CHECK(unicode));
711 if (PyUnicode_IS_READY(unicode))
712 return unicode_result_ready(unicode);
713 else
714 return unicode_result_wchar(unicode);
715}
716
Victor Stinnerc4b49542011-12-11 22:44:26 +0100717static PyObject*
718unicode_result_unchanged(PyObject *unicode)
719{
720 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500721 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100722 return NULL;
723 Py_INCREF(unicode);
724 return unicode;
725 }
726 else
727 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100728 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100729}
730
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200731/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
732 ASCII, Latin1, UTF-8, etc. */
733static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200734backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
736{
Victor Stinnerad771582015-10-09 12:38:53 +0200737 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200738 Py_UCS4 ch;
739 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300740 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200741
742 assert(PyUnicode_IS_READY(unicode));
743 kind = PyUnicode_KIND(unicode);
744 data = PyUnicode_DATA(unicode);
745
746 size = 0;
747 /* determine replacement size */
748 for (i = collstart; i < collend; ++i) {
749 Py_ssize_t incr;
750
751 ch = PyUnicode_READ(kind, data, i);
752 if (ch < 0x100)
753 incr = 2+2;
754 else if (ch < 0x10000)
755 incr = 2+4;
756 else {
757 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200758 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200759 }
760 if (size > PY_SSIZE_T_MAX - incr) {
761 PyErr_SetString(PyExc_OverflowError,
762 "encoded result is too long for a Python string");
763 return NULL;
764 }
765 size += incr;
766 }
767
Victor Stinnerad771582015-10-09 12:38:53 +0200768 str = _PyBytesWriter_Prepare(writer, str, size);
769 if (str == NULL)
770 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200771
772 /* generate replacement */
773 for (i = collstart; i < collend; ++i) {
774 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200775 *str++ = '\\';
776 if (ch >= 0x00010000) {
777 *str++ = 'U';
778 *str++ = Py_hexdigits[(ch>>28)&0xf];
779 *str++ = Py_hexdigits[(ch>>24)&0xf];
780 *str++ = Py_hexdigits[(ch>>20)&0xf];
781 *str++ = Py_hexdigits[(ch>>16)&0xf];
782 *str++ = Py_hexdigits[(ch>>12)&0xf];
783 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200784 }
Victor Stinner797485e2015-10-09 03:17:30 +0200785 else if (ch >= 0x100) {
786 *str++ = 'u';
787 *str++ = Py_hexdigits[(ch>>12)&0xf];
788 *str++ = Py_hexdigits[(ch>>8)&0xf];
789 }
790 else
791 *str++ = 'x';
792 *str++ = Py_hexdigits[(ch>>4)&0xf];
793 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200794 }
795 return str;
796}
797
798/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
799 ASCII, Latin1, UTF-8, etc. */
800static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200801xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200802 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
803{
Victor Stinnerad771582015-10-09 12:38:53 +0200804 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200805 Py_UCS4 ch;
806 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300807 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200808
809 assert(PyUnicode_IS_READY(unicode));
810 kind = PyUnicode_KIND(unicode);
811 data = PyUnicode_DATA(unicode);
812
813 size = 0;
814 /* determine replacement size */
815 for (i = collstart; i < collend; ++i) {
816 Py_ssize_t incr;
817
818 ch = PyUnicode_READ(kind, data, i);
819 if (ch < 10)
820 incr = 2+1+1;
821 else if (ch < 100)
822 incr = 2+2+1;
823 else if (ch < 1000)
824 incr = 2+3+1;
825 else if (ch < 10000)
826 incr = 2+4+1;
827 else if (ch < 100000)
828 incr = 2+5+1;
829 else if (ch < 1000000)
830 incr = 2+6+1;
831 else {
832 assert(ch <= MAX_UNICODE);
833 incr = 2+7+1;
834 }
835 if (size > PY_SSIZE_T_MAX - incr) {
836 PyErr_SetString(PyExc_OverflowError,
837 "encoded result is too long for a Python string");
838 return NULL;
839 }
840 size += incr;
841 }
842
Victor Stinnerad771582015-10-09 12:38:53 +0200843 str = _PyBytesWriter_Prepare(writer, str, size);
844 if (str == NULL)
845 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200846
847 /* generate replacement */
848 for (i = collstart; i < collend; ++i) {
849 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
850 }
851 return str;
852}
853
Thomas Wouters477c8d52006-05-27 19:21:47 +0000854/* --- Bloom Filters ----------------------------------------------------- */
855
856/* stuff to implement simple "bloom filters" for Unicode characters.
857 to keep things simple, we use a single bitmask, using the least 5
858 bits from each unicode characters as the bit index. */
859
860/* the linebreak mask is set up by Unicode_Init below */
861
Antoine Pitrouf068f942010-01-13 14:19:12 +0000862#if LONG_BIT >= 128
863#define BLOOM_WIDTH 128
864#elif LONG_BIT >= 64
865#define BLOOM_WIDTH 64
866#elif LONG_BIT >= 32
867#define BLOOM_WIDTH 32
868#else
869#error "LONG_BIT is smaller than 32"
870#endif
871
Thomas Wouters477c8d52006-05-27 19:21:47 +0000872#define BLOOM_MASK unsigned long
873
Serhiy Storchaka05997252013-01-26 12:14:02 +0200874static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000875
Antoine Pitrouf068f942010-01-13 14:19:12 +0000876#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877
Benjamin Peterson29060642009-01-31 22:14:21 +0000878#define BLOOM_LINEBREAK(ch) \
879 ((ch) < 128U ? ascii_linebreak[(ch)] : \
880 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000881
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700882static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300883make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884{
Victor Stinnera85af502013-04-09 21:53:54 +0200885#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
886 do { \
887 TYPE *data = (TYPE *)PTR; \
888 TYPE *end = data + LEN; \
889 Py_UCS4 ch; \
890 for (; data != end; data++) { \
891 ch = *data; \
892 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
893 } \
894 break; \
895 } while (0)
896
Thomas Wouters477c8d52006-05-27 19:21:47 +0000897 /* calculate simple bloom-style bitmask for a given unicode string */
898
Antoine Pitrouf068f942010-01-13 14:19:12 +0000899 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000900
901 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200902 switch (kind) {
903 case PyUnicode_1BYTE_KIND:
904 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
905 break;
906 case PyUnicode_2BYTE_KIND:
907 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
908 break;
909 case PyUnicode_4BYTE_KIND:
910 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
911 break;
912 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700913 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200914 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000915 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200916
917#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000918}
919
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300920static int
921ensure_unicode(PyObject *obj)
922{
923 if (!PyUnicode_Check(obj)) {
924 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200925 "must be str, not %.100s",
926 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300927 return -1;
928 }
929 return PyUnicode_READY(obj);
930}
931
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200932/* Compilation of templated routines */
933
934#include "stringlib/asciilib.h"
935#include "stringlib/fastsearch.h"
936#include "stringlib/partition.h"
937#include "stringlib/split.h"
938#include "stringlib/count.h"
939#include "stringlib/find.h"
940#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200941#include "stringlib/undef.h"
942
943#include "stringlib/ucs1lib.h"
944#include "stringlib/fastsearch.h"
945#include "stringlib/partition.h"
946#include "stringlib/split.h"
947#include "stringlib/count.h"
948#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300949#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200950#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200951#include "stringlib/undef.h"
952
953#include "stringlib/ucs2lib.h"
954#include "stringlib/fastsearch.h"
955#include "stringlib/partition.h"
956#include "stringlib/split.h"
957#include "stringlib/count.h"
958#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300959#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200960#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200961#include "stringlib/undef.h"
962
963#include "stringlib/ucs4lib.h"
964#include "stringlib/fastsearch.h"
965#include "stringlib/partition.h"
966#include "stringlib/split.h"
967#include "stringlib/count.h"
968#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300969#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200970#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200971#include "stringlib/undef.h"
972
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200973#include "stringlib/unicodedefs.h"
974#include "stringlib/fastsearch.h"
975#include "stringlib/count.h"
976#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100977#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200978
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979/* --- Unicode Object ----------------------------------------------------- */
980
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700981static inline Py_ssize_t
982findchar(const void *s, int kind,
983 Py_ssize_t size, Py_UCS4 ch,
984 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 switch (kind) {
987 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200988 if ((Py_UCS1) ch != ch)
989 return -1;
990 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600991 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600993 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200994 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200995 if ((Py_UCS2) ch != ch)
996 return -1;
997 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600998 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001000 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001001 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001004 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001006 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001007 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009}
1010
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten (every byte is set to 0xff); nothing is done if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030static PyObject*
1031resize_compact(PyObject *unicode, Py_ssize_t length)
1032{
1033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035 Py_ssize_t new_size;
1036 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001037 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1040#endif
1041
Victor Stinner79891572012-05-03 13:43:07 +02001042 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001044 assert(PyUnicode_IS_COMPACT(unicode));
1045
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001046 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001047 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001048 struct_size = sizeof(PyASCIIObject);
1049 else
1050 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001051 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1054 PyErr_NoMemory();
1055 return NULL;
1056 }
1057 new_size = (struct_size + (length + 1) * char_size);
1058
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001059 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1060 PyObject_DEL(_PyUnicode_UTF8(unicode));
1061 _PyUnicode_UTF8(unicode) = NULL;
1062 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1063 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001064#ifdef Py_REF_DEBUG
1065 _Py_RefTotal--;
1066#endif
1067#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001068 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001069#endif
Victor Stinner84def372011-12-11 20:04:56 +01001070
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001071 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001072 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001073 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 PyErr_NoMemory();
1075 return NULL;
1076 }
Victor Stinner84def372011-12-11 20:04:56 +01001077 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001079
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001081 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001083 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 _PyUnicode_WSTR_LENGTH(unicode) = length;
1085 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001086 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1087 PyObject_DEL(_PyUnicode_WSTR(unicode));
1088 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001089 if (!PyUnicode_IS_ASCII(unicode))
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001091 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001092#ifdef Py_DEBUG
1093 unicode_fill_invalid(unicode, old_length);
1094#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1096 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098 return unicode;
1099}
1100
Alexander Belopolsky40018472011-02-26 01:02:56 +00001101static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103{
Victor Stinner95663112011-10-04 01:03:50 +02001104 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001108
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109 if (PyUnicode_IS_READY(unicode)) {
1110 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001113#ifdef Py_DEBUG
1114 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1115#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116
1117 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001118 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1120 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121
1122 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1123 PyErr_NoMemory();
1124 return -1;
1125 }
1126 new_size = (length + 1) * char_size;
1127
Victor Stinner7a9105a2011-12-12 00:13:42 +01001128 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1129 {
1130 PyObject_DEL(_PyUnicode_UTF8(unicode));
1131 _PyUnicode_UTF8(unicode) = NULL;
1132 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1133 }
1134
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 data = (PyObject *)PyObject_REALLOC(data, new_size);
1136 if (data == NULL) {
1137 PyErr_NoMemory();
1138 return -1;
1139 }
1140 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001141 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001142 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001143 _PyUnicode_WSTR_LENGTH(unicode) = length;
1144 }
1145 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001146 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_UTF8_LENGTH(unicode) = length;
1148 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 _PyUnicode_LENGTH(unicode) = length;
1150 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001151#ifdef Py_DEBUG
1152 unicode_fill_invalid(unicode, old_length);
1153#endif
Victor Stinner95663112011-10-04 01:03:50 +02001154 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001155 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001158 }
Victor Stinner95663112011-10-04 01:03:50 +02001159 assert(_PyUnicode_WSTR(unicode) != NULL);
1160
1161 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001162 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001166 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001167 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001169 if (!wstr) {
1170 PyErr_NoMemory();
1171 return -1;
1172 }
1173 _PyUnicode_WSTR(unicode) = wstr;
1174 _PyUnicode_WSTR(unicode)[length] = 0;
1175 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001176 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return 0;
1178}
1179
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180static PyObject*
1181resize_copy(PyObject *unicode, Py_ssize_t length)
1182{
1183 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001184 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001187 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188
1189 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1190 if (copy == NULL)
1191 return NULL;
1192
1193 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001194 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001196 }
1197 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001198 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001199
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001200 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 if (w == NULL)
1202 return NULL;
1203 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1204 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001205 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001206 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001207 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 }
1209}
1210
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001212 Ux0000 terminated; some code (e.g. new_identifier)
1213 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214
1215 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218*/
1219
Alexander Belopolsky40018472011-02-26 01:02:56 +00001220static PyUnicodeObject *
1221_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001223 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
Thomas Wouters477c8d52006-05-27 19:21:47 +00001226 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227 if (length == 0 && unicode_empty != NULL) {
1228 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001229 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 }
1231
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001232 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001233 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001234 return (PyUnicodeObject *)PyErr_NoMemory();
1235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001236 if (length < 0) {
1237 PyErr_SetString(PyExc_SystemError,
1238 "Negative size passed to _PyUnicode_New");
1239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 }
1241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1243 if (unicode == NULL)
1244 return NULL;
1245 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001246
1247 _PyUnicode_WSTR_LENGTH(unicode) = length;
1248 _PyUnicode_HASH(unicode) = -1;
1249 _PyUnicode_STATE(unicode).interned = 0;
1250 _PyUnicode_STATE(unicode).kind = 0;
1251 _PyUnicode_STATE(unicode).compact = 0;
1252 _PyUnicode_STATE(unicode).ready = 0;
1253 _PyUnicode_STATE(unicode).ascii = 0;
1254 _PyUnicode_DATA_ANY(unicode) = NULL;
1255 _PyUnicode_LENGTH(unicode) = 0;
1256 _PyUnicode_UTF8(unicode) = NULL;
1257 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001259 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1260 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001261 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001262 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001263 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001265
Jeremy Hyltond8082792003-09-16 19:41:39 +00001266 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001267 * the caller fails before initializing str -- unicode_resize()
1268 * reads str[0], and the Keep-Alive optimization can keep memory
1269 * allocated for str alive across a call to unicode_dealloc(unicode).
1270 * We don't want unicode_resize to read uninitialized memory in
1271 * that case.
1272 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001273 _PyUnicode_WSTR(unicode)[0] = 0;
1274 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001275
Victor Stinner7931d9a2011-11-04 00:22:48 +01001276 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 return unicode;
1278}
1279
Victor Stinnerf42dc442011-10-02 23:33:16 +02001280static const char*
1281unicode_kind_name(PyObject *unicode)
1282{
Victor Stinner42dfd712011-10-03 14:41:45 +02001283 /* don't check consistency: unicode_kind_name() is called from
1284 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001285 if (!PyUnicode_IS_COMPACT(unicode))
1286 {
1287 if (!PyUnicode_IS_READY(unicode))
1288 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001289 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001290 {
1291 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001292 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001293 return "legacy ascii";
1294 else
1295 return "legacy latin1";
1296 case PyUnicode_2BYTE_KIND:
1297 return "legacy UCS2";
1298 case PyUnicode_4BYTE_KIND:
1299 return "legacy UCS4";
1300 default:
1301 return "<legacy invalid kind>";
1302 }
1303 }
1304 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001305 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001306 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001307 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001308 return "ascii";
1309 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001310 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001311 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001312 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001313 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001314 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001315 default:
1316 return "<invalid compact kind>";
1317 }
1318}
1319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001322const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001323 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001324 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325}
1326
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001327const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001328 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001329 return _PyUnicode_COMPACT_DATA(unicode);
1330}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001331const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001332 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001333 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1335 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1336 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1337 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1338 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1339 return PyUnicode_DATA(unicode);
1340}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001341
1342void
1343_PyUnicode_Dump(PyObject *op)
1344{
1345 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001346 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1347 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001348 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001349
Victor Stinnera849a4b2011-10-03 12:12:11 +02001350 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001351 {
1352 if (ascii->state.ascii)
1353 data = (ascii + 1);
1354 else
1355 data = (compact + 1);
1356 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001357 else
1358 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001359 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1360 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001361
Victor Stinnera849a4b2011-10-03 12:12:11 +02001362 if (ascii->wstr == data)
1363 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001364 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001365
Victor Stinnera3b334d2011-10-03 13:53:37 +02001366 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001367 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001368 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1369 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001370 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001371 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001372 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001373 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001374}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375#endif
1376
1377PyObject *
1378PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1379{
1380 PyObject *obj;
1381 PyCompactUnicodeObject *unicode;
1382 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001383 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001384 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 Py_ssize_t char_size;
1386 Py_ssize_t struct_size;
1387
1388 /* Optimization for empty strings */
1389 if (size == 0 && unicode_empty != NULL) {
1390 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001391 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 }
1393
Victor Stinner9e9d6892011-10-04 01:02:02 +02001394 is_ascii = 0;
1395 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 struct_size = sizeof(PyCompactUnicodeObject);
1397 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001398 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 char_size = 1;
1400 is_ascii = 1;
1401 struct_size = sizeof(PyASCIIObject);
1402 }
1403 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001404 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 char_size = 1;
1406 }
1407 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001408 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 char_size = 2;
1410 if (sizeof(wchar_t) == 2)
1411 is_sharing = 1;
1412 }
1413 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001414 if (maxchar > MAX_UNICODE) {
1415 PyErr_SetString(PyExc_SystemError,
1416 "invalid maximum character passed to PyUnicode_New");
1417 return NULL;
1418 }
Victor Stinner8f825062012-04-27 13:55:39 +02001419 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 char_size = 4;
1421 if (sizeof(wchar_t) == 4)
1422 is_sharing = 1;
1423 }
1424
1425 /* Ensure we won't overflow the size. */
1426 if (size < 0) {
1427 PyErr_SetString(PyExc_SystemError,
1428 "Negative size passed to PyUnicode_New");
1429 return NULL;
1430 }
1431 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1432 return PyErr_NoMemory();
1433
1434 /* Duplicated allocation code from _PyObject_New() instead of a call to
1435 * PyObject_New() so we are able to allocate space for the object and
1436 * it's data buffer.
1437 */
1438 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1439 if (obj == NULL)
1440 return PyErr_NoMemory();
1441 obj = PyObject_INIT(obj, &PyUnicode_Type);
1442 if (obj == NULL)
1443 return NULL;
1444
1445 unicode = (PyCompactUnicodeObject *)obj;
1446 if (is_ascii)
1447 data = ((PyASCIIObject*)obj) + 1;
1448 else
1449 data = unicode + 1;
1450 _PyUnicode_LENGTH(unicode) = size;
1451 _PyUnicode_HASH(unicode) = -1;
1452 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001453 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001454 _PyUnicode_STATE(unicode).compact = 1;
1455 _PyUnicode_STATE(unicode).ready = 1;
1456 _PyUnicode_STATE(unicode).ascii = is_ascii;
1457 if (is_ascii) {
1458 ((char*)data)[size] = 0;
1459 _PyUnicode_WSTR(unicode) = NULL;
1460 }
Victor Stinner8f825062012-04-27 13:55:39 +02001461 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 ((char*)data)[size] = 0;
1463 _PyUnicode_WSTR(unicode) = NULL;
1464 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001466 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 else {
1469 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001470 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001471 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001473 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 ((Py_UCS4*)data)[size] = 0;
1475 if (is_sharing) {
1476 _PyUnicode_WSTR_LENGTH(unicode) = size;
1477 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1478 }
1479 else {
1480 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1481 _PyUnicode_WSTR(unicode) = NULL;
1482 }
1483 }
Victor Stinner8f825062012-04-27 13:55:39 +02001484#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001485 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001486#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001487 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 return obj;
1489}
1490
1491#if SIZEOF_WCHAR_T == 2
1492/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1493 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001494 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495
1496 This function assumes that unicode can hold one more code point than wstr
1497 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001498static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001500 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501{
1502 const wchar_t *iter;
1503 Py_UCS4 *ucs4_out;
1504
Victor Stinner910337b2011-10-03 03:20:16 +02001505 assert(unicode != NULL);
1506 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1508 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1509
1510 for (iter = begin; iter < end; ) {
1511 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1512 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001513 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1514 && (iter+1) < end
1515 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 {
Victor Stinner551ac952011-11-29 22:58:13 +01001517 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518 iter += 2;
1519 }
1520 else {
1521 *ucs4_out++ = *iter;
1522 iter++;
1523 }
1524 }
1525 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1526 _PyUnicode_GET_LENGTH(unicode)));
1527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001528}
1529#endif
1530
Victor Stinnercd9950f2011-10-02 00:34:53 +02001531static int
Victor Stinner488fa492011-12-12 00:01:39 +01001532unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001533{
Victor Stinner488fa492011-12-12 00:01:39 +01001534 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001535 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001536 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001537 return -1;
1538 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001539 return 0;
1540}
1541
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001542static int
1543_copy_characters(PyObject *to, Py_ssize_t to_start,
1544 PyObject *from, Py_ssize_t from_start,
1545 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001546{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001547 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001548 const void *from_data;
1549 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550
Victor Stinneree4544c2012-05-09 22:24:08 +02001551 assert(0 <= how_many);
1552 assert(0 <= from_start);
1553 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001554 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001555 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001556 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001557
Victor Stinnerd3f08822012-05-29 12:57:52 +02001558 assert(PyUnicode_Check(to));
1559 assert(PyUnicode_IS_READY(to));
1560 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1561
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001562 if (how_many == 0)
1563 return 0;
1564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001566 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001568 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569
Victor Stinnerf1852262012-06-16 16:38:26 +02001570#ifdef Py_DEBUG
1571 if (!check_maxchar
1572 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1573 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001574 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001575 Py_UCS4 ch;
1576 Py_ssize_t i;
1577 for (i=0; i < how_many; i++) {
1578 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1579 assert(ch <= to_maxchar);
1580 }
1581 }
1582#endif
1583
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001584 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001585 if (check_maxchar
1586 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1587 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001588 /* Writing Latin-1 characters into an ASCII string requires to
1589 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001590 Py_UCS4 max_char;
1591 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001592 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001593 if (max_char >= 128)
1594 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001595 }
Christian Heimesf051e432016-09-13 20:22:02 +02001596 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001597 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001598 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001600 else if (from_kind == PyUnicode_1BYTE_KIND
1601 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001602 {
1603 _PyUnicode_CONVERT_BYTES(
1604 Py_UCS1, Py_UCS2,
1605 PyUnicode_1BYTE_DATA(from) + from_start,
1606 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1607 PyUnicode_2BYTE_DATA(to) + to_start
1608 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001609 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001610 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001611 && to_kind == PyUnicode_4BYTE_KIND)
1612 {
1613 _PyUnicode_CONVERT_BYTES(
1614 Py_UCS1, Py_UCS4,
1615 PyUnicode_1BYTE_DATA(from) + from_start,
1616 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1617 PyUnicode_4BYTE_DATA(to) + to_start
1618 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001619 }
1620 else if (from_kind == PyUnicode_2BYTE_KIND
1621 && to_kind == PyUnicode_4BYTE_KIND)
1622 {
1623 _PyUnicode_CONVERT_BYTES(
1624 Py_UCS2, Py_UCS4,
1625 PyUnicode_2BYTE_DATA(from) + from_start,
1626 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1627 PyUnicode_4BYTE_DATA(to) + to_start
1628 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001629 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001630 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001631 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1632
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001633 if (!check_maxchar) {
1634 if (from_kind == PyUnicode_2BYTE_KIND
1635 && to_kind == PyUnicode_1BYTE_KIND)
1636 {
1637 _PyUnicode_CONVERT_BYTES(
1638 Py_UCS2, Py_UCS1,
1639 PyUnicode_2BYTE_DATA(from) + from_start,
1640 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1641 PyUnicode_1BYTE_DATA(to) + to_start
1642 );
1643 }
1644 else if (from_kind == PyUnicode_4BYTE_KIND
1645 && to_kind == PyUnicode_1BYTE_KIND)
1646 {
1647 _PyUnicode_CONVERT_BYTES(
1648 Py_UCS4, Py_UCS1,
1649 PyUnicode_4BYTE_DATA(from) + from_start,
1650 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1651 PyUnicode_1BYTE_DATA(to) + to_start
1652 );
1653 }
1654 else if (from_kind == PyUnicode_4BYTE_KIND
1655 && to_kind == PyUnicode_2BYTE_KIND)
1656 {
1657 _PyUnicode_CONVERT_BYTES(
1658 Py_UCS4, Py_UCS2,
1659 PyUnicode_4BYTE_DATA(from) + from_start,
1660 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1661 PyUnicode_2BYTE_DATA(to) + to_start
1662 );
1663 }
1664 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001665 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001666 }
1667 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001668 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001669 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001670 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001671 Py_ssize_t i;
1672
Victor Stinnera0702ab2011-09-29 14:14:38 +02001673 for (i=0; i < how_many; i++) {
1674 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001675 if (ch > to_maxchar)
1676 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001677 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1678 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001679 }
1680 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001681 return 0;
1682}
1683
Victor Stinnerd3f08822012-05-29 12:57:52 +02001684void
1685_PyUnicode_FastCopyCharacters(
1686 PyObject *to, Py_ssize_t to_start,
1687 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001688{
1689 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1690}
1691
1692Py_ssize_t
1693PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1694 PyObject *from, Py_ssize_t from_start,
1695 Py_ssize_t how_many)
1696{
1697 int err;
1698
1699 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1700 PyErr_BadInternalCall();
1701 return -1;
1702 }
1703
Benjamin Petersonbac79492012-01-14 13:34:47 -05001704 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001705 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001706 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001707 return -1;
1708
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001709 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001710 PyErr_SetString(PyExc_IndexError, "string index out of range");
1711 return -1;
1712 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001713 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001714 PyErr_SetString(PyExc_IndexError, "string index out of range");
1715 return -1;
1716 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001717 if (how_many < 0) {
1718 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1719 return -1;
1720 }
1721 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001722 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1723 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001724 "Cannot write %zi characters at %zi "
1725 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001726 how_many, to_start, PyUnicode_GET_LENGTH(to));
1727 return -1;
1728 }
1729
1730 if (how_many == 0)
1731 return 0;
1732
Victor Stinner488fa492011-12-12 00:01:39 +01001733 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001734 return -1;
1735
1736 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1737 if (err) {
1738 PyErr_Format(PyExc_SystemError,
1739 "Cannot copy %s characters "
1740 "into a string of %s characters",
1741 unicode_kind_name(from),
1742 unicode_kind_name(to));
1743 return -1;
1744 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001745 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746}
1747
Victor Stinner17222162011-09-28 22:15:37 +02001748/* Find the maximum code point and count the number of surrogate pairs so a
1749 correct string length can be computed before converting a string to UCS4.
1750 This function counts single surrogates as a character and not as a pair.
1751
1752 Return 0 on success, or -1 on error. */
1753static int
1754find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1755 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756{
1757 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001758 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759
Victor Stinnerc53be962011-10-02 21:33:54 +02001760 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 *num_surrogates = 0;
1762 *maxchar = 0;
1763
1764 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001766 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1767 && (iter+1) < end
1768 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1769 {
1770 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1771 ++(*num_surrogates);
1772 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 }
1774 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001776 {
1777 ch = *iter;
1778 iter++;
1779 }
1780 if (ch > *maxchar) {
1781 *maxchar = ch;
1782 if (*maxchar > MAX_UNICODE) {
1783 PyErr_Format(PyExc_ValueError,
1784 "character U+%x is not in range [U+0000; U+10ffff]",
1785 ch);
1786 return -1;
1787 }
1788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 }
1790 return 0;
1791}
1792
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001793int
1794_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795{
1796 wchar_t *end;
1797 Py_UCS4 maxchar = 0;
1798 Py_ssize_t num_surrogates;
1799#if SIZEOF_WCHAR_T == 2
1800 Py_ssize_t length_wo_surrogates;
1801#endif
1802
Georg Brandl7597add2011-10-05 16:36:47 +02001803 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001804 strings were created using _PyObject_New() and where no canonical
1805 representation (the str field) has been set yet aka strings
1806 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001807 assert(_PyUnicode_CHECK(unicode));
1808 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001810 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001811 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001812 /* Actually, it should neither be interned nor be anything else: */
1813 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001816 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001817 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819
1820 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001821 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1822 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 PyErr_NoMemory();
1824 return -1;
1825 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001826 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 _PyUnicode_WSTR(unicode), end,
1828 PyUnicode_1BYTE_DATA(unicode));
1829 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1830 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1831 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1832 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001833 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001834 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001835 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836 }
1837 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001838 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001839 _PyUnicode_UTF8(unicode) = NULL;
1840 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 }
1842 PyObject_FREE(_PyUnicode_WSTR(unicode));
1843 _PyUnicode_WSTR(unicode) = NULL;
1844 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1845 }
1846 /* In this case we might have to convert down from 4-byte native
1847 wchar_t to 2-byte unicode. */
1848 else if (maxchar < 65536) {
1849 assert(num_surrogates == 0 &&
1850 "FindMaxCharAndNumSurrogatePairs() messed up");
1851
Victor Stinner506f5922011-09-28 22:34:18 +02001852#if SIZEOF_WCHAR_T == 2
1853 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001854 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001855 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1856 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1857 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001858 _PyUnicode_UTF8(unicode) = NULL;
1859 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001860#else
1861 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001862 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001863 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001864 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001865 PyErr_NoMemory();
1866 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 }
Victor Stinner506f5922011-09-28 22:34:18 +02001868 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1869 _PyUnicode_WSTR(unicode), end,
1870 PyUnicode_2BYTE_DATA(unicode));
1871 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1872 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1873 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001874 _PyUnicode_UTF8(unicode) = NULL;
1875 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001876 PyObject_FREE(_PyUnicode_WSTR(unicode));
1877 _PyUnicode_WSTR(unicode) = NULL;
1878 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1879#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001880 }
1881 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1882 else {
1883#if SIZEOF_WCHAR_T == 2
1884 /* in case the native representation is 2-bytes, we need to allocate a
1885 new normalized 4-byte version. */
1886 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001887 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1888 PyErr_NoMemory();
1889 return -1;
1890 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001891 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1892 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001893 PyErr_NoMemory();
1894 return -1;
1895 }
1896 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1897 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001898 _PyUnicode_UTF8(unicode) = NULL;
1899 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001900 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1901 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001902 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 PyObject_FREE(_PyUnicode_WSTR(unicode));
1904 _PyUnicode_WSTR(unicode) = NULL;
1905 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1906#else
1907 assert(num_surrogates == 0);
1908
Victor Stinnerc3c74152011-10-02 20:39:55 +02001909 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001910 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001911 _PyUnicode_UTF8(unicode) = NULL;
1912 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1914#endif
1915 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1916 }
1917 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001918 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 return 0;
1920}
1921
Alexander Belopolsky40018472011-02-26 01:02:56 +00001922static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001923unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924{
Walter Dörwald16807132007-05-25 13:52:07 +00001925 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001926 case SSTATE_NOT_INTERNED:
1927 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001928
Benjamin Peterson29060642009-01-31 22:14:21 +00001929 case SSTATE_INTERNED_MORTAL:
1930 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001931 Py_SET_REFCNT(unicode, 3);
Victor Stinner607b1022020-05-05 18:50:30 +02001932#ifdef INTERNED_STRINGS
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001933 if (PyDict_DelItem(interned, unicode) != 0) {
1934 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1935 NULL);
1936 }
Victor Stinner607b1022020-05-05 18:50:30 +02001937#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001938 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001939
Benjamin Peterson29060642009-01-31 22:14:21 +00001940 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001941 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1942 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001943
Benjamin Peterson29060642009-01-31 22:14:21 +00001944 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001945 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001946 }
1947
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001948 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001950 }
1951 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001952 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001953 }
1954 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001955 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001958 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959}
1960
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001961#ifdef Py_DEBUG
1962static int
1963unicode_is_singleton(PyObject *unicode)
1964{
Victor Stinner607b1022020-05-05 18:50:30 +02001965 if (unicode == unicode_empty) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001966 return 1;
Victor Stinner607b1022020-05-05 18:50:30 +02001967 }
1968#ifdef LATIN1_SINGLETONS
1969 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001970 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1971 {
1972 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1973 if (ch < 256 && unicode_latin1[ch] == unicode)
1974 return 1;
1975 }
Victor Stinner607b1022020-05-05 18:50:30 +02001976#endif
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001977 return 0;
1978}
1979#endif
1980
Alexander Belopolsky40018472011-02-26 01:02:56 +00001981static int
Victor Stinner488fa492011-12-12 00:01:39 +01001982unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001983{
Victor Stinner488fa492011-12-12 00:01:39 +01001984 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001985 if (Py_REFCNT(unicode) != 1)
1986 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001987 if (_PyUnicode_HASH(unicode) != -1)
1988 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001989 if (PyUnicode_CHECK_INTERNED(unicode))
1990 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001991 if (!PyUnicode_CheckExact(unicode))
1992 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001993#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001994 /* singleton refcount is greater than 1 */
1995 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001996#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001997 return 1;
1998}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999
Victor Stinnerfe226c02011-10-03 03:52:20 +02002000static int
2001unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2002{
2003 PyObject *unicode;
2004 Py_ssize_t old_length;
2005
2006 assert(p_unicode != NULL);
2007 unicode = *p_unicode;
2008
2009 assert(unicode != NULL);
2010 assert(PyUnicode_Check(unicode));
2011 assert(0 <= length);
2012
Victor Stinner910337b2011-10-03 03:20:16 +02002013 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002014 old_length = PyUnicode_WSTR_LENGTH(unicode);
2015 else
2016 old_length = PyUnicode_GET_LENGTH(unicode);
2017 if (old_length == length)
2018 return 0;
2019
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002020 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002021 _Py_INCREF_UNICODE_EMPTY();
2022 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00002023 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002024 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002025 return 0;
2026 }
2027
Victor Stinner488fa492011-12-12 00:01:39 +01002028 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002029 PyObject *copy = resize_copy(unicode, length);
2030 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002032 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002033 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002034 }
2035
Victor Stinnerfe226c02011-10-03 03:52:20 +02002036 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002037 PyObject *new_unicode = resize_compact(unicode, length);
2038 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002039 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002040 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002041 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002042 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002043 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002044}
2045
Alexander Belopolsky40018472011-02-26 01:02:56 +00002046int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002047PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002048{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002049 PyObject *unicode;
2050 if (p_unicode == NULL) {
2051 PyErr_BadInternalCall();
2052 return -1;
2053 }
2054 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002055 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002056 {
2057 PyErr_BadInternalCall();
2058 return -1;
2059 }
2060 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002061}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002062
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002063/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002064
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002065 WARNING: The function doesn't copy the terminating null character and
2066 doesn't check the maximum character (may write a latin1 character in an
2067 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002068static void
2069unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2070 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002071{
2072 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002073 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002074 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002075
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002076 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002077 switch (kind) {
2078 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002079#ifdef Py_DEBUG
2080 if (PyUnicode_IS_ASCII(unicode)) {
2081 Py_UCS4 maxchar = ucs1lib_find_max_char(
2082 (const Py_UCS1*)str,
2083 (const Py_UCS1*)str + len);
2084 assert(maxchar < 128);
2085 }
2086#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002087 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002088 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002089 }
2090 case PyUnicode_2BYTE_KIND: {
2091 Py_UCS2 *start = (Py_UCS2 *)data + index;
2092 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002093
Victor Stinner184252a2012-06-16 02:57:41 +02002094 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002095 *ucs2 = (Py_UCS2)*str;
2096
2097 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002098 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002099 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002100 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002101 Py_UCS4 *start = (Py_UCS4 *)data + index;
2102 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002103
Victor Stinner184252a2012-06-16 02:57:41 +02002104 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002105 *ucs4 = (Py_UCS4)*str;
2106
2107 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002108 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002109 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002110 default:
2111 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002112 }
2113}
2114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002115static PyObject*
2116get_latin1_char(unsigned char ch)
2117{
Victor Stinner607b1022020-05-05 18:50:30 +02002118 PyObject *unicode;
2119
2120#ifdef LATIN1_SINGLETONS
2121 unicode = unicode_latin1[ch];
2122 if (unicode) {
2123 Py_INCREF(unicode);
2124 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 }
Victor Stinner607b1022020-05-05 18:50:30 +02002126#endif
2127
2128 unicode = PyUnicode_New(1, ch);
2129 if (!unicode) {
2130 return NULL;
2131 }
2132
2133 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2134 assert(_PyUnicode_CheckConsistency(unicode, 1));
2135
2136#ifdef LATIN1_SINGLETONS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137 Py_INCREF(unicode);
Victor Stinner607b1022020-05-05 18:50:30 +02002138 unicode_latin1[ch] = unicode;
2139#endif
Victor Stinnera464fc12011-10-02 20:39:30 +02002140 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002141}
2142
Victor Stinner985a82a2014-01-03 12:53:47 +01002143static PyObject*
2144unicode_char(Py_UCS4 ch)
2145{
2146 PyObject *unicode;
2147
2148 assert(ch <= MAX_UNICODE);
2149
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002150 if (ch < 256)
2151 return get_latin1_char(ch);
2152
Victor Stinner985a82a2014-01-03 12:53:47 +01002153 unicode = PyUnicode_New(1, ch);
2154 if (unicode == NULL)
2155 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002156
2157 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2158 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002159 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002160 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002161 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2162 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2163 }
2164 assert(_PyUnicode_CheckConsistency(unicode, 1));
2165 return unicode;
2166}
2167
Alexander Belopolsky40018472011-02-26 01:02:56 +00002168PyObject *
2169PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002171 if (u == NULL)
2172 return (PyObject*)_PyUnicode_New(size);
2173
2174 if (size < 0) {
2175 PyErr_BadInternalCall();
2176 return NULL;
2177 }
2178
2179 return PyUnicode_FromWideChar(u, size);
2180}
2181
2182PyObject *
2183PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2184{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002185 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 Py_UCS4 maxchar = 0;
2187 Py_ssize_t num_surrogates;
2188
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002189 if (u == NULL && size != 0) {
2190 PyErr_BadInternalCall();
2191 return NULL;
2192 }
2193
2194 if (size == -1) {
2195 size = wcslen(u);
2196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002198 /* If the Unicode data is known at construction time, we can apply
2199 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002202 if (size == 0)
2203 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 /* Single character Unicode objects in the Latin-1 range are
2206 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002207 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 return get_latin1_char((unsigned char)*u);
2209
2210 /* If not empty and not single character, copy the Unicode data
2211 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002212 if (find_maxchar_surrogates(u, u + size,
2213 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214 return NULL;
2215
Victor Stinner8faf8212011-12-08 22:14:11 +01002216 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 if (!unicode)
2218 return NULL;
2219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 switch (PyUnicode_KIND(unicode)) {
2221 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002222 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2224 break;
2225 case PyUnicode_2BYTE_KIND:
2226#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002227 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002229 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2231#endif
2232 break;
2233 case PyUnicode_4BYTE_KIND:
2234#if SIZEOF_WCHAR_T == 2
2235 /* This is the only case which has to process surrogates, thus
2236 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002237 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238#else
2239 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002240 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241#endif
2242 break;
2243 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002244 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002247 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248}
2249
Alexander Belopolsky40018472011-02-26 01:02:56 +00002250PyObject *
2251PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002252{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002253 if (size < 0) {
2254 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002255 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002256 return NULL;
2257 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002258 if (u != NULL)
2259 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2260 else
2261 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002262}
2263
Alexander Belopolsky40018472011-02-26 01:02:56 +00002264PyObject *
2265PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002266{
2267 size_t size = strlen(u);
2268 if (size > PY_SSIZE_T_MAX) {
2269 PyErr_SetString(PyExc_OverflowError, "input too long");
2270 return NULL;
2271 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002272 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002273}
2274
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002275PyObject *
2276_PyUnicode_FromId(_Py_Identifier *id)
2277{
Victor Stinner297257f2020-06-02 14:39:45 +02002278 if (id->object) {
2279 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002280 }
Victor Stinner297257f2020-06-02 14:39:45 +02002281
2282 PyObject *obj;
2283 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2284 strlen(id->string),
2285 NULL, NULL);
2286 if (!obj) {
2287 return NULL;
2288 }
2289 PyUnicode_InternInPlace(&obj);
2290
2291 assert(!id->next);
2292 id->object = obj;
2293 id->next = static_strings;
2294 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002295 return id->object;
2296}
2297
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002298static void
2299unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002300{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002301 _Py_Identifier *tmp, *s = static_strings;
2302 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002303 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002304 tmp = s->next;
2305 s->next = NULL;
2306 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002307 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002308 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002309}
2310
Benjamin Peterson0df54292012-03-26 14:50:32 -04002311/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002312
Victor Stinnerd3f08822012-05-29 12:57:52 +02002313PyObject*
2314_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002315{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002316 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002317 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002318 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002319#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002320 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002321#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002322 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002323 }
Victor Stinner785938e2011-12-11 20:09:03 +01002324 unicode = PyUnicode_New(size, 127);
2325 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002326 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002327 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2328 assert(_PyUnicode_CheckConsistency(unicode, 1));
2329 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002330}
2331
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002332static Py_UCS4
2333kind_maxchar_limit(unsigned int kind)
2334{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002335 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002336 case PyUnicode_1BYTE_KIND:
2337 return 0x80;
2338 case PyUnicode_2BYTE_KIND:
2339 return 0x100;
2340 case PyUnicode_4BYTE_KIND:
2341 return 0x10000;
2342 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002343 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002344 }
2345}
2346
Victor Stinner702c7342011-10-05 13:50:52 +02002347static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002348_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002349{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002350 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002351 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002352
Serhiy Storchaka678db842013-01-26 12:16:36 +02002353 if (size == 0)
2354 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002355 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002356 if (size == 1)
2357 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002358
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002359 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002360 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 if (!res)
2362 return NULL;
2363 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002364 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002365 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002366}
2367
Victor Stinnere57b1c02011-09-28 22:20:48 +02002368static PyObject*
2369_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002370{
2371 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002372 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002373
Serhiy Storchaka678db842013-01-26 12:16:36 +02002374 if (size == 0)
2375 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002376 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002377 if (size == 1)
2378 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002379
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002380 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002381 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 if (!res)
2383 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002384 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002386 else {
2387 _PyUnicode_CONVERT_BYTES(
2388 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2389 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002390 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 return res;
2392}
2393
Victor Stinnere57b1c02011-09-28 22:20:48 +02002394static PyObject*
2395_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396{
2397 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002398 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002399
Serhiy Storchaka678db842013-01-26 12:16:36 +02002400 if (size == 0)
2401 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002402 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002403 if (size == 1)
2404 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002405
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002406 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002407 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 if (!res)
2409 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002410 if (max_char < 256)
2411 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2412 PyUnicode_1BYTE_DATA(res));
2413 else if (max_char < 0x10000)
2414 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2415 PyUnicode_2BYTE_DATA(res));
2416 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002418 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 return res;
2420}
2421
2422PyObject*
2423PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2424{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002425 if (size < 0) {
2426 PyErr_SetString(PyExc_ValueError, "size must be positive");
2427 return NULL;
2428 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002429 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002430 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002431 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002433 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002435 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002436 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002437 PyErr_SetString(PyExc_SystemError, "invalid kind");
2438 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002440}
2441
Victor Stinnerece58de2012-04-23 23:36:38 +02002442Py_UCS4
2443_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2444{
2445 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002446 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002447
2448 assert(PyUnicode_IS_READY(unicode));
2449 assert(0 <= start);
2450 assert(end <= PyUnicode_GET_LENGTH(unicode));
2451 assert(start <= end);
2452
2453 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2454 return PyUnicode_MAX_CHAR_VALUE(unicode);
2455
2456 if (start == end)
2457 return 127;
2458
Victor Stinner94d558b2012-04-27 22:26:58 +02002459 if (PyUnicode_IS_ASCII(unicode))
2460 return 127;
2461
Victor Stinnerece58de2012-04-23 23:36:38 +02002462 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002463 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002464 endptr = (char *)startptr + end * kind;
2465 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002466 switch(kind) {
2467 case PyUnicode_1BYTE_KIND:
2468 return ucs1lib_find_max_char(startptr, endptr);
2469 case PyUnicode_2BYTE_KIND:
2470 return ucs2lib_find_max_char(startptr, endptr);
2471 case PyUnicode_4BYTE_KIND:
2472 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002473 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002474 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002475 }
2476}
2477
Victor Stinner25a4b292011-10-06 12:31:55 +02002478/* Ensure that a string uses the most efficient storage, if it is not the
2479 case: create a new string with of the right kind. Write NULL into *p_unicode
2480 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002481static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002482unicode_adjust_maxchar(PyObject **p_unicode)
2483{
2484 PyObject *unicode, *copy;
2485 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002486 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002487 unsigned int kind;
2488
2489 assert(p_unicode != NULL);
2490 unicode = *p_unicode;
2491 assert(PyUnicode_IS_READY(unicode));
2492 if (PyUnicode_IS_ASCII(unicode))
2493 return;
2494
2495 len = PyUnicode_GET_LENGTH(unicode);
2496 kind = PyUnicode_KIND(unicode);
2497 if (kind == PyUnicode_1BYTE_KIND) {
2498 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002499 max_char = ucs1lib_find_max_char(u, u + len);
2500 if (max_char >= 128)
2501 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002502 }
2503 else if (kind == PyUnicode_2BYTE_KIND) {
2504 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002505 max_char = ucs2lib_find_max_char(u, u + len);
2506 if (max_char >= 256)
2507 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002508 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002509 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002510 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002511 max_char = ucs4lib_find_max_char(u, u + len);
2512 if (max_char >= 0x10000)
2513 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002514 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002515 else
2516 Py_UNREACHABLE();
2517
Victor Stinner25a4b292011-10-06 12:31:55 +02002518 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002519 if (copy != NULL)
2520 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002521 Py_DECREF(unicode);
2522 *p_unicode = copy;
2523}
2524
Victor Stinner034f6cf2011-09-30 02:26:44 +02002525PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002526_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002527{
Victor Stinner87af4f22011-11-21 23:03:47 +01002528 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002529 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002530
Victor Stinner034f6cf2011-09-30 02:26:44 +02002531 if (!PyUnicode_Check(unicode)) {
2532 PyErr_BadInternalCall();
2533 return NULL;
2534 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002535 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002536 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002537
Victor Stinner87af4f22011-11-21 23:03:47 +01002538 length = PyUnicode_GET_LENGTH(unicode);
2539 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002540 if (!copy)
2541 return NULL;
2542 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2543
Christian Heimesf051e432016-09-13 20:22:02 +02002544 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002545 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002546 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002547 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002548}
2549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002550
Victor Stinnerbc603d12011-10-02 01:00:40 +02002551/* Widen Unicode objects to larger buffers. Don't write terminating null
2552 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002554static void*
2555unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002557 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002558
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002559 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002560 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002561 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002562 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002563 if (!result)
2564 return PyErr_NoMemory();
2565 assert(skind == PyUnicode_1BYTE_KIND);
2566 _PyUnicode_CONVERT_BYTES(
2567 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002568 (const Py_UCS1 *)data,
2569 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002570 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002571 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002572 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002573 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002574 if (!result)
2575 return PyErr_NoMemory();
2576 if (skind == PyUnicode_2BYTE_KIND) {
2577 _PyUnicode_CONVERT_BYTES(
2578 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002579 (const Py_UCS2 *)data,
2580 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002581 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002582 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002583 else {
2584 assert(skind == PyUnicode_1BYTE_KIND);
2585 _PyUnicode_CONVERT_BYTES(
2586 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002587 (const Py_UCS1 *)data,
2588 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002589 result);
2590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002592 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002593 Py_UNREACHABLE();
2594 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002596}
2597
2598static Py_UCS4*
2599as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2600 int copy_null)
2601{
2602 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002603 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 Py_ssize_t len, targetlen;
2605 if (PyUnicode_READY(string) == -1)
2606 return NULL;
2607 kind = PyUnicode_KIND(string);
2608 data = PyUnicode_DATA(string);
2609 len = PyUnicode_GET_LENGTH(string);
2610 targetlen = len;
2611 if (copy_null)
2612 targetlen++;
2613 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002614 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002615 if (!target) {
2616 PyErr_NoMemory();
2617 return NULL;
2618 }
2619 }
2620 else {
2621 if (targetsize < targetlen) {
2622 PyErr_Format(PyExc_SystemError,
2623 "string is longer than the buffer");
2624 if (copy_null && 0 < targetsize)
2625 target[0] = 0;
2626 return NULL;
2627 }
2628 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002629 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002630 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002631 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002633 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002634 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002635 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2636 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002637 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002638 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002639 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002640 else {
2641 Py_UNREACHABLE();
2642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002643 if (copy_null)
2644 target[len] = 0;
2645 return target;
2646}
2647
2648Py_UCS4*
2649PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2650 int copy_null)
2651{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002652 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002653 PyErr_BadInternalCall();
2654 return NULL;
2655 }
2656 return as_ucs4(string, target, targetsize, copy_null);
2657}
2658
2659Py_UCS4*
2660PyUnicode_AsUCS4Copy(PyObject *string)
2661{
2662 return as_ucs4(string, NULL, 0, 1);
2663}
2664
/* Upper bound on the number of characters produced when formatting with
   %lld or %p.  ceil(log10(256) * SIZEOF_LONG_LONG) decimal digits are
   always enough, and one more character is reserved for the sign.
   53/22 is used as a rational upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG * 53 - 1) / 22) + 2)
Victor Stinner96865452011-03-01 23:44:09 +00002669
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002670static int
2671unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2672 Py_ssize_t width, Py_ssize_t precision)
2673{
2674 Py_ssize_t length, fill, arglen;
2675 Py_UCS4 maxchar;
2676
2677 if (PyUnicode_READY(str) == -1)
2678 return -1;
2679
2680 length = PyUnicode_GET_LENGTH(str);
2681 if ((precision == -1 || precision >= length)
2682 && width <= length)
2683 return _PyUnicodeWriter_WriteStr(writer, str);
2684
2685 if (precision != -1)
2686 length = Py_MIN(precision, length);
2687
2688 arglen = Py_MAX(length, width);
2689 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2690 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2691 else
2692 maxchar = writer->maxchar;
2693
2694 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2695 return -1;
2696
2697 if (width > length) {
2698 fill = width - length;
2699 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2700 return -1;
2701 writer->pos += fill;
2702 }
2703
2704 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2705 str, 0, length);
2706 writer->pos += length;
2707 return 0;
2708}
2709
2710static int
Victor Stinner998b8062018-09-12 00:23:25 +02002711unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002712 Py_ssize_t width, Py_ssize_t precision)
2713{
2714 /* UTF-8 */
2715 Py_ssize_t length;
2716 PyObject *unicode;
2717 int res;
2718
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002719 if (precision == -1) {
2720 length = strlen(str);
2721 }
2722 else {
2723 length = 0;
2724 while (length < precision && str[length]) {
2725 length++;
2726 }
2727 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002728 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2729 if (unicode == NULL)
2730 return -1;
2731
2732 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2733 Py_DECREF(unicode);
2734 return res;
2735}
2736
Victor Stinner96865452011-03-01 23:44:09 +00002737static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002738unicode_fromformat_arg(_PyUnicodeWriter *writer,
2739 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002740{
Victor Stinnere215d962012-10-06 23:03:36 +02002741 const char *p;
2742 Py_ssize_t len;
2743 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002744 Py_ssize_t width;
2745 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002746 int longflag;
2747 int longlongflag;
2748 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002749 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002750
2751 p = f;
2752 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002753 zeropad = 0;
2754 if (*f == '0') {
2755 zeropad = 1;
2756 f++;
2757 }
Victor Stinner96865452011-03-01 23:44:09 +00002758
2759 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002760 width = -1;
2761 if (Py_ISDIGIT((unsigned)*f)) {
2762 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002763 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002764 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002766 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002767 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002768 return NULL;
2769 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002770 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002771 f++;
2772 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002773 }
2774 precision = -1;
2775 if (*f == '.') {
2776 f++;
2777 if (Py_ISDIGIT((unsigned)*f)) {
2778 precision = (*f - '0');
2779 f++;
2780 while (Py_ISDIGIT((unsigned)*f)) {
2781 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2782 PyErr_SetString(PyExc_ValueError,
2783 "precision too big");
2784 return NULL;
2785 }
2786 precision = (precision * 10) + (*f - '0');
2787 f++;
2788 }
2789 }
Victor Stinner96865452011-03-01 23:44:09 +00002790 if (*f == '%') {
2791 /* "%.3%s" => f points to "3" */
2792 f--;
2793 }
2794 }
2795 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002796 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002797 f--;
2798 }
Victor Stinner96865452011-03-01 23:44:09 +00002799
2800 /* Handle %ld, %lu, %lld and %llu. */
2801 longflag = 0;
2802 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002803 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002804 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002805 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002806 longflag = 1;
2807 ++f;
2808 }
Victor Stinner96865452011-03-01 23:44:09 +00002809 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002810 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002811 longlongflag = 1;
2812 f += 2;
2813 }
Victor Stinner96865452011-03-01 23:44:09 +00002814 }
2815 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002816 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002817 size_tflag = 1;
2818 ++f;
2819 }
Victor Stinnere215d962012-10-06 23:03:36 +02002820
2821 if (f[1] == '\0')
2822 writer->overallocate = 0;
2823
2824 switch (*f) {
2825 case 'c':
2826 {
2827 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002828 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002829 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002830 "character argument not in range(0x110000)");
2831 return NULL;
2832 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002833 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002834 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002835 break;
2836 }
2837
2838 case 'i':
2839 case 'd':
2840 case 'u':
2841 case 'x':
2842 {
2843 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002844 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002845 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002846
2847 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002848 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002849 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002850 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002851 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002852 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002853 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002854 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002855 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002856 va_arg(*vargs, size_t));
2857 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002858 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002859 va_arg(*vargs, unsigned int));
2860 }
2861 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002862 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002863 }
2864 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002865 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002866 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002867 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002868 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002869 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002870 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002871 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002872 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002873 va_arg(*vargs, Py_ssize_t));
2874 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002875 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002876 va_arg(*vargs, int));
2877 }
2878 assert(len >= 0);
2879
Victor Stinnere215d962012-10-06 23:03:36 +02002880 if (precision < len)
2881 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002882
2883 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002884 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2885 return NULL;
2886
Victor Stinnere215d962012-10-06 23:03:36 +02002887 if (width > precision) {
2888 Py_UCS4 fillchar;
2889 fill = width - precision;
2890 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002891 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2892 return NULL;
2893 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002894 }
Victor Stinner15a11362012-10-06 23:48:20 +02002895 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002896 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002897 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2898 return NULL;
2899 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002900 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002901
Victor Stinner4a587072013-11-19 12:54:53 +01002902 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2903 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002904 break;
2905 }
2906
2907 case 'p':
2908 {
2909 char number[MAX_LONG_LONG_CHARS];
2910
2911 len = sprintf(number, "%p", va_arg(*vargs, void*));
2912 assert(len >= 0);
2913
2914 /* %p is ill-defined: ensure leading 0x. */
2915 if (number[1] == 'X')
2916 number[1] = 'x';
2917 else if (number[1] != 'x') {
2918 memmove(number + 2, number,
2919 strlen(number) + 1);
2920 number[0] = '0';
2921 number[1] = 'x';
2922 len += 2;
2923 }
2924
Victor Stinner4a587072013-11-19 12:54:53 +01002925 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002926 return NULL;
2927 break;
2928 }
2929
2930 case 's':
2931 {
2932 /* UTF-8 */
2933 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002934 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002935 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002936 break;
2937 }
2938
2939 case 'U':
2940 {
2941 PyObject *obj = va_arg(*vargs, PyObject *);
2942 assert(obj && _PyUnicode_CHECK(obj));
2943
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002944 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002945 return NULL;
2946 break;
2947 }
2948
2949 case 'V':
2950 {
2951 PyObject *obj = va_arg(*vargs, PyObject *);
2952 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002953 if (obj) {
2954 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002955 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002956 return NULL;
2957 }
2958 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002959 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002960 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002961 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002962 }
2963 break;
2964 }
2965
2966 case 'S':
2967 {
2968 PyObject *obj = va_arg(*vargs, PyObject *);
2969 PyObject *str;
2970 assert(obj);
2971 str = PyObject_Str(obj);
2972 if (!str)
2973 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002974 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002975 Py_DECREF(str);
2976 return NULL;
2977 }
2978 Py_DECREF(str);
2979 break;
2980 }
2981
2982 case 'R':
2983 {
2984 PyObject *obj = va_arg(*vargs, PyObject *);
2985 PyObject *repr;
2986 assert(obj);
2987 repr = PyObject_Repr(obj);
2988 if (!repr)
2989 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002990 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002991 Py_DECREF(repr);
2992 return NULL;
2993 }
2994 Py_DECREF(repr);
2995 break;
2996 }
2997
2998 case 'A':
2999 {
3000 PyObject *obj = va_arg(*vargs, PyObject *);
3001 PyObject *ascii;
3002 assert(obj);
3003 ascii = PyObject_ASCII(obj);
3004 if (!ascii)
3005 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003006 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003007 Py_DECREF(ascii);
3008 return NULL;
3009 }
3010 Py_DECREF(ascii);
3011 break;
3012 }
3013
3014 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003015 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003016 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003017 break;
3018
3019 default:
3020 /* if we stumble upon an unknown formatting code, copy the rest
3021 of the format string to the output string. (we cannot just
3022 skip the code, since there's no way to know what's in the
3023 argument list) */
3024 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003025 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003026 return NULL;
3027 f = p+len;
3028 return f;
3029 }
3030
3031 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003032 return f;
3033}
3034
Walter Dörwaldd2034312007-05-18 16:29:38 +00003035PyObject *
3036PyUnicode_FromFormatV(const char *format, va_list vargs)
3037{
Victor Stinnere215d962012-10-06 23:03:36 +02003038 va_list vargs2;
3039 const char *f;
3040 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003041
Victor Stinner8f674cc2013-04-17 23:02:17 +02003042 _PyUnicodeWriter_Init(&writer);
3043 writer.min_length = strlen(format) + 100;
3044 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003045
Benjamin Peterson0c212142016-09-20 20:39:33 -07003046 // Copy varags to be able to pass a reference to a subfunction.
3047 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003048
3049 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003050 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003051 f = unicode_fromformat_arg(&writer, f, &vargs2);
3052 if (f == NULL)
3053 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003055 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003056 const char *p;
3057 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003058
Victor Stinnere215d962012-10-06 23:03:36 +02003059 p = f;
3060 do
3061 {
3062 if ((unsigned char)*p > 127) {
3063 PyErr_Format(PyExc_ValueError,
3064 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3065 "string, got a non-ASCII byte: 0x%02x",
3066 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003067 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003068 }
3069 p++;
3070 }
3071 while (*p != '\0' && *p != '%');
3072 len = p - f;
3073
3074 if (*p == '\0')
3075 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003076
3077 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003078 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003079
3080 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003081 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003082 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003083 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003084 return _PyUnicodeWriter_Finish(&writer);
3085
3086 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003087 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003088 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003089 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003090}
3091
Walter Dörwaldd2034312007-05-18 16:29:38 +00003092PyObject *
3093PyUnicode_FromFormat(const char *format, ...)
3094{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003095 PyObject* ret;
3096 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003097
3098#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003099 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003100#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003101 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003102#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003103 ret = PyUnicode_FromFormatV(format, vargs);
3104 va_end(vargs);
3105 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003106}
3107
Serhiy Storchakac46db922018-10-23 22:58:24 +03003108static Py_ssize_t
3109unicode_get_widechar_size(PyObject *unicode)
3110{
3111 Py_ssize_t res;
3112
3113 assert(unicode != NULL);
3114 assert(_PyUnicode_CHECK(unicode));
3115
3116 if (_PyUnicode_WSTR(unicode) != NULL) {
3117 return PyUnicode_WSTR_LENGTH(unicode);
3118 }
3119 assert(PyUnicode_IS_READY(unicode));
3120
3121 res = _PyUnicode_LENGTH(unicode);
3122#if SIZEOF_WCHAR_T == 2
3123 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3124 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3125 const Py_UCS4 *end = s + res;
3126 for (; s < end; ++s) {
3127 if (*s > 0xFFFF) {
3128 ++res;
3129 }
3130 }
3131 }
3132#endif
3133 return res;
3134}
3135
3136static void
3137unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3138{
3139 const wchar_t *wstr;
3140
3141 assert(unicode != NULL);
3142 assert(_PyUnicode_CHECK(unicode));
3143
3144 wstr = _PyUnicode_WSTR(unicode);
3145 if (wstr != NULL) {
3146 memcpy(w, wstr, size * sizeof(wchar_t));
3147 return;
3148 }
3149 assert(PyUnicode_IS_READY(unicode));
3150
3151 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3152 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3153 for (; size--; ++s, ++w) {
3154 *w = *s;
3155 }
3156 }
3157 else {
3158#if SIZEOF_WCHAR_T == 4
3159 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3160 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3161 for (; size--; ++s, ++w) {
3162 *w = *s;
3163 }
3164#else
3165 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3166 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3167 for (; size--; ++s, ++w) {
3168 Py_UCS4 ch = *s;
3169 if (ch > 0xFFFF) {
3170 assert(ch <= MAX_UNICODE);
3171 /* encode surrogate pair in this case */
3172 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3173 if (!size--)
3174 break;
3175 *w = Py_UNICODE_LOW_SURROGATE(ch);
3176 }
3177 else {
3178 *w = ch;
3179 }
3180 }
3181#endif
3182 }
3183}
3184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003185#ifdef HAVE_WCHAR_H
3186
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003187/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003188
Victor Stinnerd88d9832011-09-06 02:00:05 +02003189 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003190 character) required to convert the unicode object. Ignore size argument.
3191
Victor Stinnerd88d9832011-09-06 02:00:05 +02003192 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003193 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003194 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003195Py_ssize_t
3196PyUnicode_AsWideChar(PyObject *unicode,
3197 wchar_t *w,
3198 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003199{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003200 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003201
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003202 if (unicode == NULL) {
3203 PyErr_BadInternalCall();
3204 return -1;
3205 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003206 if (!PyUnicode_Check(unicode)) {
3207 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003208 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003209 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003210
3211 res = unicode_get_widechar_size(unicode);
3212 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003213 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003214 }
3215
3216 if (size > res) {
3217 size = res + 1;
3218 }
3219 else {
3220 res = size;
3221 }
3222 unicode_copy_as_widechar(unicode, w, size);
3223 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003224}
3225
Victor Stinner137c34c2010-09-29 10:25:54 +00003226wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003227PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003228 Py_ssize_t *size)
3229{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003230 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003231 Py_ssize_t buflen;
3232
3233 if (unicode == NULL) {
3234 PyErr_BadInternalCall();
3235 return NULL;
3236 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003237 if (!PyUnicode_Check(unicode)) {
3238 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003239 return NULL;
3240 }
3241
Serhiy Storchakac46db922018-10-23 22:58:24 +03003242 buflen = unicode_get_widechar_size(unicode);
3243 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003244 if (buffer == NULL) {
3245 PyErr_NoMemory();
3246 return NULL;
3247 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003248 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3249 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003250 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003251 }
3252 else if (wcslen(buffer) != (size_t)buflen) {
3253 PyMem_FREE(buffer);
3254 PyErr_SetString(PyExc_ValueError,
3255 "embedded null character");
3256 return NULL;
3257 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003258 return buffer;
3259}
3260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003261#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262
Alexander Belopolsky40018472011-02-26 01:02:56 +00003263PyObject *
3264PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003265{
Victor Stinner8faf8212011-12-08 22:14:11 +01003266 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 PyErr_SetString(PyExc_ValueError,
3268 "chr() arg not in range(0x110000)");
3269 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003270 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003271
Victor Stinner985a82a2014-01-03 12:53:47 +01003272 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003273}
3274
Alexander Belopolsky40018472011-02-26 01:02:56 +00003275PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003276PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003278 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003279 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003280 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003281 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003282 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 Py_INCREF(obj);
3284 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003285 }
3286 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003287 /* For a Unicode subtype that's not a Unicode object,
3288 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003289 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003290 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003291 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003292 "Can't convert '%.100s' object to str implicitly",
3293 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003294 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003295}
3296
Alexander Belopolsky40018472011-02-26 01:02:56 +00003297PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003298PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003299 const char *encoding,
3300 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003301{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003302 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003303 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003304
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003306 PyErr_BadInternalCall();
3307 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003309
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003310 /* Decoding bytes objects is the most common case and should be fast */
3311 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003312 if (PyBytes_GET_SIZE(obj) == 0) {
3313 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3314 return NULL;
3315 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003316 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003317 }
3318 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003319 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3320 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003321 }
3322
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003323 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003324 PyErr_SetString(PyExc_TypeError,
3325 "decoding str is not supported");
3326 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003327 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003328
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003329 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3330 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3331 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003332 "decoding to str: need a bytes-like object, %.80s found",
3333 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003334 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003335 }
Tim Petersced69f82003-09-16 20:30:58 +00003336
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003337 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003338 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003339 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3340 return NULL;
3341 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003342 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003344
Serhiy Storchaka05997252013-01-26 12:14:02 +02003345 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003346 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003347 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348}
3349
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters between two copied characters is replaced by a single
   '_'; such runs at the start or the end of the name are dropped.

   encoding:  null-terminated input name, must not be NULL.
   lower:     output buffer, receives the null-terminated normalized name.
   lower_len: size of the output buffer in bytes, terminator included.

   Return 1 on success, or 0 on error (lower_len is 0, or encoding is
   longer than lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* An empty buffer cannot even hold the terminating null; without this
       check lower_len - 1 would wrap around below. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    /* last usable slot, reserved for the terminating null */
    l_end = &lower[lower_len - 1];
    /* set while a run of separator characters is pending */
    punct = 0;
    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (Py_ISALNUM(c) || c == '.') {
            /* flush a pending separator run, except at the very start */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }
    }
    *l = '\0';
    return 1;
}
3398
Alexander Belopolsky40018472011-02-26 01:02:56 +00003399PyObject *
3400PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003401 Py_ssize_t size,
3402 const char *encoding,
3403 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003404{
3405 PyObject *buffer = NULL, *unicode;
3406 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003407 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3408
Victor Stinner22eb6892019-06-26 00:51:05 +02003409 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3410 return NULL;
3411 }
3412
Victor Stinnered076ed2019-06-26 01:49:32 +02003413 if (size == 0) {
3414 _Py_RETURN_UNICODE_EMPTY();
3415 }
3416
Victor Stinner942889a2016-09-05 15:40:10 -07003417 if (encoding == NULL) {
3418 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3419 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003420
Fred Drakee4315f52000-05-09 19:53:39 +00003421 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003422 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3423 char *lower = buflower;
3424
3425 /* Fast paths */
3426 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3427 lower += 3;
3428 if (*lower == '_') {
3429 /* Match "utf8" and "utf_8" */
3430 lower++;
3431 }
3432
3433 if (lower[0] == '8' && lower[1] == 0) {
3434 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3435 }
3436 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3437 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3438 }
3439 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3440 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3441 }
3442 }
3443 else {
3444 if (strcmp(lower, "ascii") == 0
3445 || strcmp(lower, "us_ascii") == 0) {
3446 return PyUnicode_DecodeASCII(s, size, errors);
3447 }
Steve Dowercc16be82016-09-08 10:35:16 -07003448 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003449 else if (strcmp(lower, "mbcs") == 0) {
3450 return PyUnicode_DecodeMBCS(s, size, errors);
3451 }
3452 #endif
3453 else if (strcmp(lower, "latin1") == 0
3454 || strcmp(lower, "latin_1") == 0
3455 || strcmp(lower, "iso_8859_1") == 0
3456 || strcmp(lower, "iso8859_1") == 0) {
3457 return PyUnicode_DecodeLatin1(s, size, errors);
3458 }
3459 }
Victor Stinner37296e82010-06-10 13:36:23 +00003460 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461
3462 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003463 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003464 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003465 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003466 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 if (buffer == NULL)
3468 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003469 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 if (unicode == NULL)
3471 goto onError;
3472 if (!PyUnicode_Check(unicode)) {
3473 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003474 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003475 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003476 encoding,
3477 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 Py_DECREF(unicode);
3479 goto onError;
3480 }
3481 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003482 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003483
Benjamin Peterson29060642009-01-31 22:14:21 +00003484 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485 Py_XDECREF(buffer);
3486 return NULL;
3487}
3488
Alexander Belopolsky40018472011-02-26 01:02:56 +00003489PyObject *
3490PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003491 const char *encoding,
3492 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003493{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003494 if (!PyUnicode_Check(unicode)) {
3495 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003496 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003497 }
3498
Serhiy Storchaka00939072016-10-27 21:05:49 +03003499 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3500 "PyUnicode_AsDecodedObject() is deprecated; "
3501 "use PyCodec_Decode() to decode from str", 1) < 0)
3502 return NULL;
3503
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003504 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003505 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003506
3507 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003508 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003509}
3510
Alexander Belopolsky40018472011-02-26 01:02:56 +00003511PyObject *
3512PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003513 const char *encoding,
3514 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003515{
3516 PyObject *v;
3517
3518 if (!PyUnicode_Check(unicode)) {
3519 PyErr_BadArgument();
3520 goto onError;
3521 }
3522
Serhiy Storchaka00939072016-10-27 21:05:49 +03003523 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3524 "PyUnicode_AsDecodedUnicode() is deprecated; "
3525 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3526 return NULL;
3527
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003528 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003529 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003530
3531 /* Decode via the codec registry */
3532 v = PyCodec_Decode(unicode, encoding, errors);
3533 if (v == NULL)
3534 goto onError;
3535 if (!PyUnicode_Check(v)) {
3536 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003537 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003538 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003539 encoding,
3540 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003541 Py_DECREF(v);
3542 goto onError;
3543 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003544 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003545
Benjamin Peterson29060642009-01-31 22:14:21 +00003546 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003547 return NULL;
3548}
3549
Alexander Belopolsky40018472011-02-26 01:02:56 +00003550PyObject *
3551PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003552 Py_ssize_t size,
3553 const char *encoding,
3554 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555{
3556 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003557
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003558 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003560 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3562 Py_DECREF(unicode);
3563 return v;
3564}
3565
Alexander Belopolsky40018472011-02-26 01:02:56 +00003566PyObject *
3567PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003568 const char *encoding,
3569 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003570{
3571 PyObject *v;
3572
3573 if (!PyUnicode_Check(unicode)) {
3574 PyErr_BadArgument();
3575 goto onError;
3576 }
3577
Serhiy Storchaka00939072016-10-27 21:05:49 +03003578 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3579 "PyUnicode_AsEncodedObject() is deprecated; "
3580 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3581 "or PyCodec_Encode() for generic encoding", 1) < 0)
3582 return NULL;
3583
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003584 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003585 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003586
3587 /* Encode via the codec registry */
3588 v = PyCodec_Encode(unicode, encoding, errors);
3589 if (v == NULL)
3590 goto onError;
3591 return v;
3592
Benjamin Peterson29060642009-01-31 22:14:21 +00003593 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003594 return NULL;
3595}
3596
Victor Stinner1b579672011-12-17 05:47:23 +01003597
Victor Stinner2cba6b82018-01-10 22:46:15 +01003598static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003599unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003600 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003601{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003602 Py_ssize_t wlen;
3603 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3604 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003605 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003606 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003607
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003608 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003609 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003610 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003611 return NULL;
3612 }
3613
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003614 char *str;
3615 size_t error_pos;
3616 const char *reason;
3617 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003618 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003619 PyMem_Free(wstr);
3620
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003621 if (res != 0) {
3622 if (res == -2) {
3623 PyObject *exc;
3624 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3625 "locale", unicode,
3626 (Py_ssize_t)error_pos,
3627 (Py_ssize_t)(error_pos+1),
3628 reason);
3629 if (exc != NULL) {
3630 PyCodec_StrictErrors(exc);
3631 Py_DECREF(exc);
3632 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003633 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003634 else if (res == -3) {
3635 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3636 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003637 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003638 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003639 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003640 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003641 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003642
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003643 PyObject *bytes = PyBytes_FromString(str);
3644 PyMem_RawFree(str);
3645 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003646}
3647
Victor Stinnerad158722010-10-27 00:25:46 +00003648PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003649PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3650{
Victor Stinner709d23d2019-05-02 14:56:30 -04003651 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3652 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003653}
3654
3655PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003656PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003657{
Victor Stinner81a7be32020-04-14 15:14:01 +02003658 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003659 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3660 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003661 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003662 fs_codec->error_handler,
3663 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003664 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003665#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003666 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003667 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003668 fs_codec->encoding,
3669 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003670 }
Victor Stinnerad158722010-10-27 00:25:46 +00003671#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003672 else {
3673 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3674 machinery is not ready and so cannot be used:
3675 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003676 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3677 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003678 assert(filesystem_errors != NULL);
3679 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3680 assert(errors != _Py_ERROR_UNKNOWN);
3681#ifdef _Py_FORCE_UTF8_FS_ENCODING
3682 return unicode_encode_utf8(unicode, errors, NULL);
3683#else
3684 return unicode_encode_locale(unicode, errors, 0);
3685#endif
3686 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003687}
3688
Alexander Belopolsky40018472011-02-26 01:02:56 +00003689PyObject *
3690PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003691 const char *encoding,
3692 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693{
3694 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003695 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003696
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 if (!PyUnicode_Check(unicode)) {
3698 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003699 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700 }
Fred Drakee4315f52000-05-09 19:53:39 +00003701
Victor Stinner22eb6892019-06-26 00:51:05 +02003702 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3703 return NULL;
3704 }
3705
Victor Stinner942889a2016-09-05 15:40:10 -07003706 if (encoding == NULL) {
3707 return _PyUnicode_AsUTF8String(unicode, errors);
3708 }
3709
Fred Drakee4315f52000-05-09 19:53:39 +00003710 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003711 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3712 char *lower = buflower;
3713
3714 /* Fast paths */
3715 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3716 lower += 3;
3717 if (*lower == '_') {
3718 /* Match "utf8" and "utf_8" */
3719 lower++;
3720 }
3721
3722 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003723 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003724 }
3725 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3726 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3727 }
3728 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3729 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3730 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003731 }
Victor Stinner942889a2016-09-05 15:40:10 -07003732 else {
3733 if (strcmp(lower, "ascii") == 0
3734 || strcmp(lower, "us_ascii") == 0) {
3735 return _PyUnicode_AsASCIIString(unicode, errors);
3736 }
Steve Dowercc16be82016-09-08 10:35:16 -07003737#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003738 else if (strcmp(lower, "mbcs") == 0) {
3739 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3740 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003741#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003742 else if (strcmp(lower, "latin1") == 0 ||
3743 strcmp(lower, "latin_1") == 0 ||
3744 strcmp(lower, "iso_8859_1") == 0 ||
3745 strcmp(lower, "iso8859_1") == 0) {
3746 return _PyUnicode_AsLatin1String(unicode, errors);
3747 }
3748 }
Victor Stinner37296e82010-06-10 13:36:23 +00003749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750
3751 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003752 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003754 return NULL;
3755
3756 /* The normal path */
3757 if (PyBytes_Check(v))
3758 return v;
3759
3760 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003761 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003762 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003763 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003764
3765 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003766 "encoder %s returned bytearray instead of bytes; "
3767 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003768 encoding);
3769 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003770 Py_DECREF(v);
3771 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003772 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003773
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003774 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3775 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003776 Py_DECREF(v);
3777 return b;
3778 }
3779
3780 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003781 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003782 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003783 encoding,
3784 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003785 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003786 return NULL;
3787}
3788
Alexander Belopolsky40018472011-02-26 01:02:56 +00003789PyObject *
3790PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003791 const char *encoding,
3792 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003793{
3794 PyObject *v;
3795
3796 if (!PyUnicode_Check(unicode)) {
3797 PyErr_BadArgument();
3798 goto onError;
3799 }
3800
Serhiy Storchaka00939072016-10-27 21:05:49 +03003801 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3802 "PyUnicode_AsEncodedUnicode() is deprecated; "
3803 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3804 return NULL;
3805
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003806 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003807 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003808
3809 /* Encode via the codec registry */
3810 v = PyCodec_Encode(unicode, encoding, errors);
3811 if (v == NULL)
3812 goto onError;
3813 if (!PyUnicode_Check(v)) {
3814 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003815 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003816 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003817 encoding,
3818 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003819 Py_DECREF(v);
3820 goto onError;
3821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003823
Benjamin Peterson29060642009-01-31 22:14:21 +00003824 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 return NULL;
3826}
3827
Victor Stinner2cba6b82018-01-10 22:46:15 +01003828static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003829unicode_decode_locale(const char *str, Py_ssize_t len,
3830 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003831{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003832 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3833 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003834 return NULL;
3835 }
3836
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003837 wchar_t *wstr;
3838 size_t wlen;
3839 const char *reason;
3840 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003841 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003842 if (res != 0) {
3843 if (res == -2) {
3844 PyObject *exc;
3845 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3846 "locale", str, len,
3847 (Py_ssize_t)wlen,
3848 (Py_ssize_t)(wlen + 1),
3849 reason);
3850 if (exc != NULL) {
3851 PyCodec_StrictErrors(exc);
3852 Py_DECREF(exc);
3853 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003854 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003855 else if (res == -3) {
3856 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3857 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003858 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003859 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003860 }
Victor Stinner2f197072011-12-17 07:08:30 +01003861 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003862 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003863
3864 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3865 PyMem_RawFree(wstr);
3866 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003867}
3868
3869PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003870PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3871 const char *errors)
3872{
Victor Stinner709d23d2019-05-02 14:56:30 -04003873 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3874 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003875}
3876
3877PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003878PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003879{
3880 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003881 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3882 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003883}
3884
3885
3886PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003887PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003888 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003889 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3890}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003891
Christian Heimes5894ba72007-11-04 11:43:14 +00003892PyObject*
3893PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3894{
Victor Stinner81a7be32020-04-14 15:14:01 +02003895 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003896 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3897 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003898 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003899 fs_codec->error_handler,
3900 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04003901 NULL);
3902 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003903#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003904 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003905 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003906 fs_codec->encoding,
3907 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003908 }
Victor Stinnerad158722010-10-27 00:25:46 +00003909#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003910 else {
3911 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3912 machinery is not ready and so cannot be used:
3913 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003914 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3915 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003916 assert(filesystem_errors != NULL);
3917 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3918 assert(errors != _Py_ERROR_UNKNOWN);
3919#ifdef _Py_FORCE_UTF8_FS_ENCODING
3920 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3921#else
3922 return unicode_decode_locale(s, size, errors, 0);
3923#endif
3924 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003925}
3926
Martin v. Löwis011e8422009-05-05 04:43:17 +00003927
3928int
3929PyUnicode_FSConverter(PyObject* arg, void* addr)
3930{
Brett Cannonec6ce872016-09-06 15:50:29 -07003931 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003932 PyObject *output = NULL;
3933 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03003934 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003935 if (arg == NULL) {
3936 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003937 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003938 return 1;
3939 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003940 path = PyOS_FSPath(arg);
3941 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003942 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003943 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003944 if (PyBytes_Check(path)) {
3945 output = path;
3946 }
3947 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3948 output = PyUnicode_EncodeFSDefault(path);
3949 Py_DECREF(path);
3950 if (!output) {
3951 return 0;
3952 }
3953 assert(PyBytes_Check(output));
3954 }
3955
Victor Stinner0ea2a462010-04-30 00:22:08 +00003956 size = PyBytes_GET_SIZE(output);
3957 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003958 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003959 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003960 Py_DECREF(output);
3961 return 0;
3962 }
3963 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003964 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003965}
3966
3967
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003968int
3969PyUnicode_FSDecoder(PyObject* arg, void* addr)
3970{
Brett Cannona5711202016-09-06 19:36:01 -07003971 int is_buffer = 0;
3972 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003973 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003974 if (arg == NULL) {
3975 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003976 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003977 return 1;
3978 }
Brett Cannona5711202016-09-06 19:36:01 -07003979
3980 is_buffer = PyObject_CheckBuffer(arg);
3981 if (!is_buffer) {
3982 path = PyOS_FSPath(arg);
3983 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003984 return 0;
3985 }
Brett Cannona5711202016-09-06 19:36:01 -07003986 }
3987 else {
3988 path = arg;
3989 Py_INCREF(arg);
3990 }
3991
3992 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003993 output = path;
3994 }
3995 else if (PyBytes_Check(path) || is_buffer) {
3996 PyObject *path_bytes = NULL;
3997
3998 if (!PyBytes_Check(path) &&
3999 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004000 "path should be string, bytes, or os.PathLike, not %.200s",
4001 Py_TYPE(arg)->tp_name)) {
4002 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004003 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004004 }
4005 path_bytes = PyBytes_FromObject(path);
4006 Py_DECREF(path);
4007 if (!path_bytes) {
4008 return 0;
4009 }
4010 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4011 PyBytes_GET_SIZE(path_bytes));
4012 Py_DECREF(path_bytes);
4013 if (!output) {
4014 return 0;
4015 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004016 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004017 else {
4018 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004019 "path should be string, bytes, or os.PathLike, not %.200s",
4020 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004021 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004022 return 0;
4023 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004024 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004025 Py_DECREF(output);
4026 return 0;
4027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004029 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004030 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004031 Py_DECREF(output);
4032 return 0;
4033 }
4034 *(PyObject**)addr = output;
4035 return Py_CLEANUP_SUPPORTED;
4036}
4037
4038
Inada Naoki02a4d572020-02-27 13:48:59 +09004039static int unicode_fill_utf8(PyObject *unicode);
4040
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004041const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004043{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004044 if (!PyUnicode_Check(unicode)) {
4045 PyErr_BadArgument();
4046 return NULL;
4047 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004048 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004049 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004051 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004052 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 return NULL;
4054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055 }
4056
4057 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004058 *psize = PyUnicode_UTF8_LENGTH(unicode);
4059 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004060}
4061
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004062const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004064{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4066}
4067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068Py_UNICODE *
4069PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004071 if (!PyUnicode_Check(unicode)) {
4072 PyErr_BadArgument();
4073 return NULL;
4074 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004075 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4076 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004077 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004078 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004079 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080
Serhiy Storchakac46db922018-10-23 22:58:24 +03004081 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4082 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4083 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004084 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004085 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004086 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4087 if (w == NULL) {
4088 PyErr_NoMemory();
4089 return NULL;
4090 }
4091 unicode_copy_as_widechar(unicode, w, wlen + 1);
4092 _PyUnicode_WSTR(unicode) = w;
4093 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4094 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004095 }
4096 }
4097 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004098 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004099 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004100}
4101
Alexander Belopolsky40018472011-02-26 01:02:56 +00004102Py_UNICODE *
4103PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106}
4107
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004108const Py_UNICODE *
4109_PyUnicode_AsUnicode(PyObject *unicode)
4110{
4111 Py_ssize_t size;
4112 const Py_UNICODE *wstr;
4113
4114 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4115 if (wstr && wcslen(wstr) != (size_t)size) {
4116 PyErr_SetString(PyExc_ValueError, "embedded null character");
4117 return NULL;
4118 }
4119 return wstr;
4120}
4121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004122
Alexander Belopolsky40018472011-02-26 01:02:56 +00004123Py_ssize_t
4124PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125{
4126 if (!PyUnicode_Check(unicode)) {
4127 PyErr_BadArgument();
4128 goto onError;
4129 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004130 if (_PyUnicode_WSTR(unicode) == NULL) {
4131 if (PyUnicode_AsUnicode(unicode) == NULL)
4132 goto onError;
4133 }
4134 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 return -1;
4138}
4139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004140Py_ssize_t
4141PyUnicode_GetLength(PyObject *unicode)
4142{
Victor Stinner07621332012-06-16 04:53:46 +02004143 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144 PyErr_BadArgument();
4145 return -1;
4146 }
Victor Stinner07621332012-06-16 04:53:46 +02004147 if (PyUnicode_READY(unicode) == -1)
4148 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004149 return PyUnicode_GET_LENGTH(unicode);
4150}
4151
4152Py_UCS4
4153PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4154{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004155 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004156 int kind;
4157
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004158 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004159 PyErr_BadArgument();
4160 return (Py_UCS4)-1;
4161 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004162 if (PyUnicode_READY(unicode) == -1) {
4163 return (Py_UCS4)-1;
4164 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004165 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004166 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167 return (Py_UCS4)-1;
4168 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004169 data = PyUnicode_DATA(unicode);
4170 kind = PyUnicode_KIND(unicode);
4171 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004172}
4173
4174int
4175PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4176{
4177 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004178 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004179 return -1;
4180 }
Victor Stinner488fa492011-12-12 00:01:39 +01004181 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004182 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004183 PyErr_SetString(PyExc_IndexError, "string index out of range");
4184 return -1;
4185 }
Victor Stinner488fa492011-12-12 00:01:39 +01004186 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004187 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004188 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4189 PyErr_SetString(PyExc_ValueError, "character out of range");
4190 return -1;
4191 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004192 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4193 index, ch);
4194 return 0;
4195}
4196
/* Return the name of the default encoding, which is always "utf-8".
   The returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4202
Victor Stinner554f3f02010-06-16 23:33:54 +00004203/* create or adjust a UnicodeDecodeError */
4204static void
4205make_decode_exception(PyObject **exceptionObject,
4206 const char *encoding,
4207 const char *input, Py_ssize_t length,
4208 Py_ssize_t startpos, Py_ssize_t endpos,
4209 const char *reason)
4210{
4211 if (*exceptionObject == NULL) {
4212 *exceptionObject = PyUnicodeDecodeError_Create(
4213 encoding, input, length, startpos, endpos, reason);
4214 }
4215 else {
4216 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4217 goto onError;
4218 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4219 goto onError;
4220 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4221 goto onError;
4222 }
4223 return;
4224
4225onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004226 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004227}
4228
Steve Dowercc16be82016-09-08 10:35:16 -07004229#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004230static int
4231widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4232{
4233 if (newsize > *size) {
4234 wchar_t *newbuf = *buf;
4235 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4236 PyErr_NoMemory();
4237 return -1;
4238 }
4239 *buf = newbuf;
4240 }
4241 *size = newsize;
4242 return 0;
4243}
4244
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245/* error handling callback helper:
4246 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004247 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248 and adjust various state variables.
4249 return 0 on success, -1 on error
4250*/
4251
Alexander Belopolsky40018472011-02-26 01:02:56 +00004252static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004253unicode_decode_call_errorhandler_wchar(
4254 const char *errors, PyObject **errorHandler,
4255 const char *encoding, const char *reason,
4256 const char **input, const char **inend, Py_ssize_t *startinpos,
4257 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004258 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004260 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261
4262 PyObject *restuple = NULL;
4263 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004264 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004265 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004266 Py_ssize_t requiredsize;
4267 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004268 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004269 wchar_t *repwstr;
4270 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271
4272 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004273 *errorHandler = PyCodec_LookupError(errors);
4274 if (*errorHandler == NULL)
4275 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004276 }
4277
Victor Stinner554f3f02010-06-16 23:33:54 +00004278 make_decode_exception(exceptionObject,
4279 encoding,
4280 *input, *inend - *input,
4281 *startinpos, *endinpos,
4282 reason);
4283 if (*exceptionObject == NULL)
4284 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285
Petr Viktorinffd97532020-02-11 17:46:57 +01004286 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004290 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004291 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004293 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004295
4296 /* Copy back the bytes variables, which might have been modified by the
4297 callback */
4298 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4299 if (!inputobj)
4300 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004301 *input = PyBytes_AS_STRING(inputobj);
4302 insize = PyBytes_GET_SIZE(inputobj);
4303 *inend = *input + insize;
4304 /* we can DECREF safely, as the exception has another reference,
4305 so the object won't go away. */
4306 Py_DECREF(inputobj);
4307
4308 if (newpos<0)
4309 newpos = insize+newpos;
4310 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004311 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004312 goto onError;
4313 }
4314
4315 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4316 if (repwstr == NULL)
4317 goto onError;
4318 /* need more space? (at least enough for what we
4319 have+the replacement+the rest of the string (starting
4320 at the new input position), so we won't have to check space
4321 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004322 requiredsize = *outpos;
4323 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4324 goto overflow;
4325 requiredsize += repwlen;
4326 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4327 goto overflow;
4328 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004329 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004331 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004332 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004333 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004334 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004335 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004336 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004337 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004338 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004339 *endinpos = newpos;
4340 *inptr = *input + newpos;
4341
4342 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004343 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 return 0;
4345
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004346 overflow:
4347 PyErr_SetString(PyExc_OverflowError,
4348 "decoded result is too long for a Python string");
4349
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004350 onError:
4351 Py_XDECREF(restuple);
4352 return -1;
4353}
Steve Dowercc16be82016-09-08 10:35:16 -07004354#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004355
4356static int
4357unicode_decode_call_errorhandler_writer(
4358 const char *errors, PyObject **errorHandler,
4359 const char *encoding, const char *reason,
4360 const char **input, const char **inend, Py_ssize_t *startinpos,
4361 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4362 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4363{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004364 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004365
4366 PyObject *restuple = NULL;
4367 PyObject *repunicode = NULL;
4368 Py_ssize_t insize;
4369 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004370 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004371 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004372 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004373 int need_to_grow = 0;
4374 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004375
4376 if (*errorHandler == NULL) {
4377 *errorHandler = PyCodec_LookupError(errors);
4378 if (*errorHandler == NULL)
4379 goto onError;
4380 }
4381
4382 make_decode_exception(exceptionObject,
4383 encoding,
4384 *input, *inend - *input,
4385 *startinpos, *endinpos,
4386 reason);
4387 if (*exceptionObject == NULL)
4388 goto onError;
4389
Petr Viktorinffd97532020-02-11 17:46:57 +01004390 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004391 if (restuple == NULL)
4392 goto onError;
4393 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004394 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004395 goto onError;
4396 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004397 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004398 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004399
4400 /* Copy back the bytes variables, which might have been modified by the
4401 callback */
4402 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4403 if (!inputobj)
4404 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004405 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004406 *input = PyBytes_AS_STRING(inputobj);
4407 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004408 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004409 /* we can DECREF safely, as the exception has another reference,
4410 so the object won't go away. */
4411 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004412
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004415 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004416 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004418 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419
Victor Stinner170ca6f2013-04-18 00:25:28 +02004420 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004421 if (replen > 1) {
4422 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004423 need_to_grow = 1;
4424 }
4425 new_inptr = *input + newpos;
4426 if (*inend - new_inptr > remain) {
4427 /* We don't know the decoding algorithm here so we make the worst
4428 assumption that one byte decodes to one unicode character.
4429 If unfortunately one byte could decode to more unicode characters,
4430 the decoder may write out-of-bound then. Is it possible for the
4431 algorithms using this function? */
4432 writer->min_length += *inend - new_inptr - remain;
4433 need_to_grow = 1;
4434 }
4435 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004436 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004437 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004438 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4439 goto onError;
4440 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004441 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004442 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004445 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004446
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004448 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004449 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450
Benjamin Peterson29060642009-01-31 22:14:21 +00004451 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004453 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454}
4455
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004456/* --- UTF-7 Codec -------------------------------------------------------- */
4457
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458/* See RFC2152 for details. We encode conservatively and decode liberally. */
4459
/* Three simple macros defining base-64.
 *
 * Each macro may evaluate its argument more than once, so the argument
 * must not have side effects. */

/* Is c a base-64 character?  (A-Z, a-z, 0-9, '+' or '/') */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value?
 * A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, and anything
 * else (i.e. '/') -> 63. */

#define FROM_BASE64(c)                                  \
    (((c) == '+') ? 62 :                                \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4482
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Bytes above 127 never decode directly. */

#define DECODE_DIRECT(c) \
    (!((c) > 127 || (c) == '+'))
4490
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is never modified, so it
 * is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : character code; only 1..127 can ever be encoded directly
 *            (the range test guards the table lookup).
 * directO  : non-zero if Set O characters are encoded as themselves.
 * directWS : non-zero if whitespace characters are encoded as themselves.
 *
 * Every argument is parenthesized in the expansion so that an expression
 * such as `a || b` passed for a flag keeps its meaning next to `&&`.
 * `c` is evaluated more than once and must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536
Alexander Belopolsky40018472011-02-26 01:02:56 +00004537PyObject *
4538PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004539 Py_ssize_t size,
4540 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004542 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4543}
4544
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545/* The decoder. The only state we preserve is our read position,
4546 * i.e. how many characters we have consumed. So if we end in the
4547 * middle of a shift sequence we have to back off the read position
4548 * and the output to the beginning of the sequence, otherwise we lose
4549 * all the shift state (seen bits, number of bits seen, high
4550 * surrogate). */
4551
Alexander Belopolsky40018472011-02-26 01:02:56 +00004552PyObject *
4553PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004554 Py_ssize_t size,
4555 const char *errors,
4556 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004557{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004559 Py_ssize_t startinpos;
4560 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004562 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004563 const char *errmsg = "";
4564 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004565 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 unsigned int base64bits = 0;
4567 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004568 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 PyObject *errorHandler = NULL;
4570 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004572 if (size == 0) {
4573 if (consumed)
4574 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004575 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004576 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004577
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004578 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004579 _PyUnicodeWriter_Init(&writer);
4580 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004581
4582 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004583 e = s + size;
4584
4585 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004586 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004588 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004589
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 if (inShift) { /* in a base-64 section */
4591 if (IS_BASE64(ch)) { /* consume a base-64 character */
4592 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4593 base64bits += 6;
4594 s++;
4595 if (base64bits >= 16) {
4596 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004597 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004598 base64bits -= 16;
4599 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004600 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004601 if (surrogate) {
4602 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004603 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4604 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004605 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004606 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004608 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 }
4610 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004611 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004612 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004613 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 }
4615 }
Victor Stinner551ac952011-11-29 22:58:13 +01004616 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004617 /* first surrogate */
4618 surrogate = outCh;
4619 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004620 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004621 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004622 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004623 }
4624 }
4625 }
4626 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004627 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 if (base64bits > 0) { /* left-over bits */
4629 if (base64bits >= 6) {
4630 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004631 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004632 errmsg = "partial character in shift sequence";
4633 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004634 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004635 else {
4636 /* Some bits remain; they should be zero */
4637 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004638 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004639 errmsg = "non-zero padding bits in shift sequence";
4640 goto utf7Error;
4641 }
4642 }
4643 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004644 if (surrogate && DECODE_DIRECT(ch)) {
4645 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4646 goto onError;
4647 }
4648 surrogate = 0;
4649 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004650 /* '-' is absorbed; other terminating
4651 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004652 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004654 }
4655 }
4656 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004658 s++; /* consume '+' */
4659 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004660 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004661 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004662 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004664 else if (s < e && !IS_BASE64(*s)) {
4665 s++;
4666 errmsg = "ill-formed sequence";
4667 goto utf7Error;
4668 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004670 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004671 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004672 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004674 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004675 }
4676 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004677 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004678 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004679 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004680 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004681 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004682 else {
4683 startinpos = s-starts;
4684 s++;
4685 errmsg = "unexpected special character";
4686 goto utf7Error;
4687 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004688 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004689utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004691 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 errors, &errorHandler,
4693 "utf7", errmsg,
4694 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004695 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004696 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004697 }
4698
Antoine Pitrou244651a2009-05-04 18:56:13 +00004699 /* end of string */
4700
4701 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4702 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004703 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004704 if (surrogate ||
4705 (base64bits >= 6) ||
4706 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004707 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004708 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004709 errors, &errorHandler,
4710 "utf7", "unterminated shift sequence",
4711 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004712 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004713 goto onError;
4714 if (s < e)
4715 goto restart;
4716 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004717 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004718
4719 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004720 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004721 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004722 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004723 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004724 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004725 writer.kind, writer.data, shiftOutStart);
4726 Py_XDECREF(errorHandler);
4727 Py_XDECREF(exc);
4728 _PyUnicodeWriter_Dealloc(&writer);
4729 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004730 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004731 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004732 }
4733 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004734 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004735 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004736 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004737
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004738 Py_XDECREF(errorHandler);
4739 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004740 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004741
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743 Py_XDECREF(errorHandler);
4744 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004745 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004746 return NULL;
4747}
4748
4749
Alexander Belopolsky40018472011-02-26 01:02:56 +00004750PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004751_PyUnicode_EncodeUTF7(PyObject *str,
4752 int base64SetO,
4753 int base64WhiteSpace,
4754 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004755{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004756 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004757 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004758 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004759 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004760 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004761 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004762 unsigned int base64bits = 0;
4763 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004764 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004765 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004766
Benjamin Petersonbac79492012-01-14 13:34:47 -05004767 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004768 return NULL;
4769 kind = PyUnicode_KIND(str);
4770 data = PyUnicode_DATA(str);
4771 len = PyUnicode_GET_LENGTH(str);
4772
4773 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004774 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004775
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004776 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004777 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004778 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004779 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004780 if (v == NULL)
4781 return NULL;
4782
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004783 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004784 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004785 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004786
Antoine Pitrou244651a2009-05-04 18:56:13 +00004787 if (inShift) {
4788 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4789 /* shifting out */
4790 if (base64bits) { /* output remaining bits */
4791 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4792 base64buffer = 0;
4793 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004794 }
4795 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004796 /* Characters not in the BASE64 set implicitly unshift the sequence
4797 so no '-' is required, except if the character is itself a '-' */
4798 if (IS_BASE64(ch) || ch == '-') {
4799 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004800 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 *out++ = (char) ch;
4802 }
4803 else {
4804 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004805 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004806 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004807 else { /* not in a shift sequence */
4808 if (ch == '+') {
4809 *out++ = '+';
4810 *out++ = '-';
4811 }
4812 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4813 *out++ = (char) ch;
4814 }
4815 else {
4816 *out++ = '+';
4817 inShift = 1;
4818 goto encode_char;
4819 }
4820 }
4821 continue;
4822encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004823 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004824 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004825
Antoine Pitrou244651a2009-05-04 18:56:13 +00004826 /* code first surrogate */
4827 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004828 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004829 while (base64bits >= 6) {
4830 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4831 base64bits -= 6;
4832 }
4833 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004834 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004835 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004836 base64bits += 16;
4837 base64buffer = (base64buffer << 16) | ch;
4838 while (base64bits >= 6) {
4839 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4840 base64bits -= 6;
4841 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004842 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004843 if (base64bits)
4844 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4845 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004846 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004847 if (_PyBytes_Resize(&v, out - start) < 0)
4848 return NULL;
4849 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004850}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004851PyObject *
4852PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4853 Py_ssize_t size,
4854 int base64SetO,
4855 int base64WhiteSpace,
4856 const char *errors)
4857{
4858 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004859 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004860 if (tmp == NULL)
4861 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004862 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004863 base64WhiteSpace, errors);
4864 Py_DECREF(tmp);
4865 return result;
4866}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004867
Antoine Pitrou244651a2009-05-04 18:56:13 +00004868#undef IS_BASE64
4869#undef FROM_BASE64
4870#undef TO_BASE64
4871#undef DECODE_DIRECT
4872#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004873
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874/* --- UTF-8 Codec -------------------------------------------------------- */
4875
Alexander Belopolsky40018472011-02-26 01:02:56 +00004876PyObject *
4877PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004878 Py_ssize_t size,
4879 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880{
Walter Dörwald69652032004-09-07 20:24:22 +00004881 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4882}
4883
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884#include "stringlib/asciilib.h"
4885#include "stringlib/codecs.h"
4886#include "stringlib/undef.h"
4887
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004888#include "stringlib/ucs1lib.h"
4889#include "stringlib/codecs.h"
4890#include "stringlib/undef.h"
4891
4892#include "stringlib/ucs2lib.h"
4893#include "stringlib/codecs.h"
4894#include "stringlib/undef.h"
4895
4896#include "stringlib/ucs4lib.h"
4897#include "stringlib/codecs.h"
4898#include "stringlib/undef.h"
4899
Antoine Pitrouab868312009-01-10 15:40:25 +00004900/* Mask to quickly check whether a C 'long' contains a
4901 non-ASCII, UTF8-encoded char. */
4902#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004903# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004904#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004905# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004906#else
4907# error C 'long' size should be either 4 or 8!
4908#endif
4909
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004910static Py_ssize_t
4911ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004912{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004913 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004914 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004915
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004916 /*
4917 * Issue #17237: m68k is a bit different from most architectures in
4918 * that objects do not use "natural alignment" - for example, int and
4919 * long are only aligned at 2-byte boundaries. Therefore the assert()
4920 * won't work; also, tests have shown that skipping the "optimised
4921 * version" will even speed up m68k.
4922 */
4923#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004924#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004925 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4926 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004927 /* Fast path, see in STRINGLIB(utf8_decode) for
4928 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004929 /* Help allocation */
4930 const char *_p = p;
4931 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004932 while (_p < aligned_end) {
4933 unsigned long value = *(const unsigned long *) _p;
4934 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004935 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004936 *((unsigned long *)q) = value;
4937 _p += SIZEOF_LONG;
4938 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004939 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940 p = _p;
4941 while (p < end) {
4942 if ((unsigned char)*p & 0x80)
4943 break;
4944 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004946 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004948#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004949#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004950 while (p < end) {
4951 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4952 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004953 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004954 /* Help allocation */
4955 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004957 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004958 if (value & ASCII_CHAR_MASK)
4959 break;
4960 _p += SIZEOF_LONG;
4961 }
4962 p = _p;
4963 if (_p == end)
4964 break;
4965 }
4966 if ((unsigned char)*p & 0x80)
4967 break;
4968 ++p;
4969 }
4970 memcpy(dest, start, p - start);
4971 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972}
Antoine Pitrouab868312009-01-10 15:40:25 +00004973
Victor Stinner709d23d2019-05-02 14:56:30 -04004974static PyObject *
4975unicode_decode_utf8(const char *s, Py_ssize_t size,
4976 _Py_error_handler error_handler, const char *errors,
4977 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004978{
Victor Stinner785938e2011-12-11 20:09:03 +01004979 if (size == 0) {
4980 if (consumed)
4981 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004982 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004983 }
4984
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004985 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4986 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004987 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004988 *consumed = 1;
4989 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004990 }
4991
Inada Naoki770847a2019-06-24 12:30:24 +09004992 const char *starts = s;
4993 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004994
Inada Naoki770847a2019-06-24 12:30:24 +09004995 // fast path: try ASCII string.
4996 PyObject *u = PyUnicode_New(size, 127);
4997 if (u == NULL) {
4998 return NULL;
4999 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005000 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005001 if (s == end) {
5002 return u;
5003 }
5004
5005 // Use _PyUnicodeWriter after fast path is failed.
5006 _PyUnicodeWriter writer;
5007 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5008 writer.pos = s - starts;
5009
5010 Py_ssize_t startinpos, endinpos;
5011 const char *errmsg = "";
5012 PyObject *error_handler_obj = NULL;
5013 PyObject *exc = NULL;
5014
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 while (s < end) {
5016 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005017 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005018
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005019 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005020 if (PyUnicode_IS_ASCII(writer.buffer))
5021 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005022 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005023 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005025 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005026 } else {
5027 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005028 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005029 }
5030
5031 switch (ch) {
5032 case 0:
5033 if (s == end || consumed)
5034 goto End;
5035 errmsg = "unexpected end of data";
5036 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005037 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005038 break;
5039 case 1:
5040 errmsg = "invalid start byte";
5041 startinpos = s - starts;
5042 endinpos = startinpos + 1;
5043 break;
5044 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005045 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5046 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5047 {
5048 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005049 goto End;
5050 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005051 /* fall through */
5052 case 3:
5053 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005054 errmsg = "invalid continuation byte";
5055 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005056 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005057 break;
5058 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005059 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005060 goto onError;
5061 continue;
5062 }
5063
Victor Stinner1d65d912015-10-05 13:43:50 +02005064 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005065 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005066
5067 switch (error_handler) {
5068 case _Py_ERROR_IGNORE:
5069 s += (endinpos - startinpos);
5070 break;
5071
5072 case _Py_ERROR_REPLACE:
5073 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5074 goto onError;
5075 s += (endinpos - startinpos);
5076 break;
5077
5078 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005079 {
5080 Py_ssize_t i;
5081
Victor Stinner1d65d912015-10-05 13:43:50 +02005082 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5083 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005084 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005085 ch = (Py_UCS4)(unsigned char)(starts[i]);
5086 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5087 ch + 0xdc00);
5088 writer.pos++;
5089 }
5090 s += (endinpos - startinpos);
5091 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005092 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005093
5094 default:
5095 if (unicode_decode_call_errorhandler_writer(
5096 errors, &error_handler_obj,
5097 "utf-8", errmsg,
5098 &starts, &end, &startinpos, &endinpos, &exc, &s,
5099 &writer))
5100 goto onError;
5101 }
Victor Stinner785938e2011-12-11 20:09:03 +01005102 }
5103
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005104End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005105 if (consumed)
5106 *consumed = s - starts;
5107
Victor Stinner1d65d912015-10-05 13:43:50 +02005108 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005109 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005110 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005111
5112onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005113 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005114 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005115 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005116 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005117}
5118
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005119
Victor Stinner709d23d2019-05-02 14:56:30 -04005120PyObject *
5121PyUnicode_DecodeUTF8Stateful(const char *s,
5122 Py_ssize_t size,
5123 const char *errors,
5124 Py_ssize_t *consumed)
5125{
5126 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5127}
5128
5129
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005130/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5131 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005132
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005133 On success, write a pointer to a newly allocated wide character string into
5134 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5135 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005136
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005137 On memory allocation failure, return -1.
5138
5139 On decoding error (if surrogateescape is zero), return -2. If wlen is
5140 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5141 is not NULL, write the decoding error message into *reason. */
5142int
5143_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005144 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005145{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005146 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005147 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005148 wchar_t *unicode;
5149 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005150
Victor Stinner3d4226a2018-08-29 22:21:32 +02005151 int surrogateescape = 0;
5152 int surrogatepass = 0;
5153 switch (errors)
5154 {
5155 case _Py_ERROR_STRICT:
5156 break;
5157 case _Py_ERROR_SURROGATEESCAPE:
5158 surrogateescape = 1;
5159 break;
5160 case _Py_ERROR_SURROGATEPASS:
5161 surrogatepass = 1;
5162 break;
5163 default:
5164 return -3;
5165 }
5166
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005167 /* Note: size will always be longer than the resulting Unicode
5168 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005169 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005170 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005171 }
5172
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005173 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005174 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005175 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005176 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005177
5178 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005179 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005180 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005181 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005182 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005183#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005184 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005185#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005186 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005187#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005188 if (ch > 0xFF) {
5189#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005190 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005191#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005192 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005193 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005194 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5195 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5196#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005197 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005198 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005199 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005200 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005201 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005202
5203 if (surrogateescape) {
5204 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5205 }
5206 else {
5207 /* Is it a valid three-byte code? */
5208 if (surrogatepass
5209 && (e - s) >= 3
5210 && (s[0] & 0xf0) == 0xe0
5211 && (s[1] & 0xc0) == 0x80
5212 && (s[2] & 0xc0) == 0x80)
5213 {
5214 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5215 s += 3;
5216 unicode[outpos++] = ch;
5217 }
5218 else {
5219 PyMem_RawFree(unicode );
5220 if (reason != NULL) {
5221 switch (ch) {
5222 case 0:
5223 *reason = "unexpected end of data";
5224 break;
5225 case 1:
5226 *reason = "invalid start byte";
5227 break;
5228 /* 2, 3, 4 */
5229 default:
5230 *reason = "invalid continuation byte";
5231 break;
5232 }
5233 }
5234 if (wlen != NULL) {
5235 *wlen = s - orig_s;
5236 }
5237 return -2;
5238 }
5239 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005240 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005241 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005242 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005243 if (wlen) {
5244 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005245 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005246 *wstr = unicode;
5247 return 0;
5248}
5249
Victor Stinner5f9cf232019-03-19 01:46:25 +01005250
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005251wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005252_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5253 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005254{
5255 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005256 int res = _Py_DecodeUTF8Ex(arg, arglen,
5257 &wstr, wlen,
5258 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005259 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005260 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5261 assert(res != -3);
5262 if (wlen) {
5263 *wlen = (size_t)res;
5264 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005265 return NULL;
5266 }
5267 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005268}
5269
Antoine Pitrouab868312009-01-10 15:40:25 +00005270
Victor Stinnere47e6982017-12-21 15:45:16 +01005271/* UTF-8 encoder using the surrogateescape error handler .
5272
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005273 On success, return 0 and write the newly allocated character string (use
5274 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005275
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005276 On encoding failure, return -2 and write the position of the invalid
5277 surrogate character into *error_pos (if error_pos is set) and the decoding
5278 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005279
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005280 On memory allocation failure, return -1. */
5281int
5282_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005283 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005284{
5285 const Py_ssize_t max_char_size = 4;
5286 Py_ssize_t len = wcslen(text);
5287
5288 assert(len >= 0);
5289
Victor Stinner3d4226a2018-08-29 22:21:32 +02005290 int surrogateescape = 0;
5291 int surrogatepass = 0;
5292 switch (errors)
5293 {
5294 case _Py_ERROR_STRICT:
5295 break;
5296 case _Py_ERROR_SURROGATEESCAPE:
5297 surrogateescape = 1;
5298 break;
5299 case _Py_ERROR_SURROGATEPASS:
5300 surrogatepass = 1;
5301 break;
5302 default:
5303 return -3;
5304 }
5305
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005306 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5307 return -1;
5308 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005309 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005310 if (raw_malloc) {
5311 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005312 }
5313 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005314 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005315 }
5316 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005317 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005318 }
5319
5320 char *p = bytes;
5321 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005322 for (i = 0; i < len; ) {
5323 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005324 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005325 i++;
5326#if Py_UNICODE_SIZE == 2
5327 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5328 && i < len
5329 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5330 {
5331 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5332 i++;
5333 }
5334#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005335
5336 if (ch < 0x80) {
5337 /* Encode ASCII */
5338 *p++ = (char) ch;
5339
5340 }
5341 else if (ch < 0x0800) {
5342 /* Encode Latin-1 */
5343 *p++ = (char)(0xc0 | (ch >> 6));
5344 *p++ = (char)(0x80 | (ch & 0x3f));
5345 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005346 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005347 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005348 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005349 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005350 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005351 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005352 if (reason != NULL) {
5353 *reason = "encoding error";
5354 }
5355 if (raw_malloc) {
5356 PyMem_RawFree(bytes);
5357 }
5358 else {
5359 PyMem_Free(bytes);
5360 }
5361 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005362 }
5363 *p++ = (char)(ch & 0xff);
5364 }
5365 else if (ch < 0x10000) {
5366 *p++ = (char)(0xe0 | (ch >> 12));
5367 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5368 *p++ = (char)(0x80 | (ch & 0x3f));
5369 }
5370 else { /* ch >= 0x10000 */
5371 assert(ch <= MAX_UNICODE);
5372 /* Encode UCS4 Unicode ordinals */
5373 *p++ = (char)(0xf0 | (ch >> 18));
5374 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5375 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5376 *p++ = (char)(0x80 | (ch & 0x3f));
5377 }
5378 }
5379 *p++ = '\0';
5380
5381 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005382 char *bytes2;
5383 if (raw_malloc) {
5384 bytes2 = PyMem_RawRealloc(bytes, final_size);
5385 }
5386 else {
5387 bytes2 = PyMem_Realloc(bytes, final_size);
5388 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005389 if (bytes2 == NULL) {
5390 if (error_pos != NULL) {
5391 *error_pos = (size_t)-1;
5392 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005393 if (raw_malloc) {
5394 PyMem_RawFree(bytes);
5395 }
5396 else {
5397 PyMem_Free(bytes);
5398 }
5399 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005400 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005401 *str = bytes2;
5402 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005403}
5404
5405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005406/* Primary internal function which creates utf8 encoded bytes objects.
5407
5408 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005409 and allocate exactly as much space needed at the end. Else allocate the
5410 maximum possible needed (4 result bytes per Unicode character), and return
5411 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005412*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005413static PyObject *
5414unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5415 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005417 if (!PyUnicode_Check(unicode)) {
5418 PyErr_BadArgument();
5419 return NULL;
5420 }
5421
5422 if (PyUnicode_READY(unicode) == -1)
5423 return NULL;
5424
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005425 if (PyUnicode_UTF8(unicode))
5426 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5427 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005428
Inada Naoki02a4d572020-02-27 13:48:59 +09005429 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005430 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005431 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5432
5433 _PyBytesWriter writer;
5434 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005435
Benjamin Petersonead6b532011-12-20 17:23:42 -06005436 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005437 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005438 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005439 case PyUnicode_1BYTE_KIND:
5440 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5441 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005442 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5443 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005444 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005445 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5446 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005447 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005448 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5449 break;
Tim Peters602f7402002-04-27 18:03:26 +00005450 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005451
5452 if (end == NULL) {
5453 _PyBytesWriter_Dealloc(&writer);
5454 return NULL;
5455 }
5456 return _PyBytesWriter_Finish(&writer, end);
5457}
5458
5459static int
5460unicode_fill_utf8(PyObject *unicode)
5461{
5462 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5463 assert(!PyUnicode_IS_ASCII(unicode));
5464
5465 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005466 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005467 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5468
5469 _PyBytesWriter writer;
5470 char *end;
5471
5472 switch (kind) {
5473 default:
5474 Py_UNREACHABLE();
5475 case PyUnicode_1BYTE_KIND:
5476 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5477 _Py_ERROR_STRICT, NULL);
5478 break;
5479 case PyUnicode_2BYTE_KIND:
5480 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5481 _Py_ERROR_STRICT, NULL);
5482 break;
5483 case PyUnicode_4BYTE_KIND:
5484 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5485 _Py_ERROR_STRICT, NULL);
5486 break;
5487 }
5488 if (end == NULL) {
5489 _PyBytesWriter_Dealloc(&writer);
5490 return -1;
5491 }
5492
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005493 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005494 PyBytes_AS_STRING(writer.buffer);
5495 Py_ssize_t len = end - start;
5496
5497 char *cache = PyObject_MALLOC(len + 1);
5498 if (cache == NULL) {
5499 _PyBytesWriter_Dealloc(&writer);
5500 PyErr_NoMemory();
5501 return -1;
5502 }
5503 _PyUnicode_UTF8(unicode) = cache;
5504 _PyUnicode_UTF8_LENGTH(unicode) = len;
5505 memcpy(cache, start, len);
5506 cache[len] = '\0';
5507 _PyBytesWriter_Dealloc(&writer);
5508 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509}
5510
Alexander Belopolsky40018472011-02-26 01:02:56 +00005511PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005512_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5513{
5514 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5515}
5516
5517
5518PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005519PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5520 Py_ssize_t size,
5521 const char *errors)
5522{
5523 PyObject *v, *unicode;
5524
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005525 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005526 if (unicode == NULL)
5527 return NULL;
5528 v = _PyUnicode_AsUTF8String(unicode, errors);
5529 Py_DECREF(unicode);
5530 return v;
5531}
5532
5533PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005534PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005536 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537}
5538
Walter Dörwald41980ca2007-08-16 21:55:45 +00005539/* --- UTF-32 Codec ------------------------------------------------------- */
5540
5541PyObject *
5542PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005543 Py_ssize_t size,
5544 const char *errors,
5545 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005546{
5547 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5548}
5549
5550PyObject *
5551PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005552 Py_ssize_t size,
5553 const char *errors,
5554 int *byteorder,
5555 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005556{
5557 const char *starts = s;
5558 Py_ssize_t startinpos;
5559 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005560 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005561 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005562 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005563 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005564 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005565 PyObject *errorHandler = NULL;
5566 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005567
Andy Lestere6be9b52020-02-11 20:28:35 -06005568 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005569 e = q + size;
5570
5571 if (byteorder)
5572 bo = *byteorder;
5573
5574 /* Check for BOM marks (U+FEFF) in the input and adjust current
5575 byte order setting accordingly. In native mode, the leading BOM
5576 mark is skipped, in all other modes, it is copied to the output
5577 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005578 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005579 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005580 if (bom == 0x0000FEFF) {
5581 bo = -1;
5582 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005584 else if (bom == 0xFFFE0000) {
5585 bo = 1;
5586 q += 4;
5587 }
5588 if (byteorder)
5589 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005590 }
5591
Victor Stinnere64322e2012-10-30 23:12:47 +01005592 if (q == e) {
5593 if (consumed)
5594 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005595 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005596 }
5597
Victor Stinnere64322e2012-10-30 23:12:47 +01005598#ifdef WORDS_BIGENDIAN
5599 le = bo < 0;
5600#else
5601 le = bo <= 0;
5602#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005603 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005604
Victor Stinner8f674cc2013-04-17 23:02:17 +02005605 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005606 writer.min_length = (e - q + 3) / 4;
5607 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005608 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005609
Victor Stinnere64322e2012-10-30 23:12:47 +01005610 while (1) {
5611 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005612 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005613
Victor Stinnere64322e2012-10-30 23:12:47 +01005614 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005615 enum PyUnicode_Kind kind = writer.kind;
5616 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005617 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005618 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005619 if (le) {
5620 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005621 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005622 if (ch > maxch)
5623 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005624 if (kind != PyUnicode_1BYTE_KIND &&
5625 Py_UNICODE_IS_SURROGATE(ch))
5626 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005627 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005628 q += 4;
5629 } while (q <= last);
5630 }
5631 else {
5632 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005633 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005634 if (ch > maxch)
5635 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005636 if (kind != PyUnicode_1BYTE_KIND &&
5637 Py_UNICODE_IS_SURROGATE(ch))
5638 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005639 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005640 q += 4;
5641 } while (q <= last);
5642 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005643 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005644 }
5645
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005646 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005647 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005648 startinpos = ((const char *)q) - starts;
5649 endinpos = startinpos + 4;
5650 }
5651 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005652 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005654 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005656 startinpos = ((const char *)q) - starts;
5657 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005659 else {
5660 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005661 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005662 goto onError;
5663 q += 4;
5664 continue;
5665 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005666 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005667 startinpos = ((const char *)q) - starts;
5668 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005670
5671 /* The remaining input chars are ignored if the callback
5672 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005673 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005675 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005677 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005679 }
5680
Walter Dörwald41980ca2007-08-16 21:55:45 +00005681 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005683
Walter Dörwald41980ca2007-08-16 21:55:45 +00005684 Py_XDECREF(errorHandler);
5685 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005686 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005687
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005690 Py_XDECREF(errorHandler);
5691 Py_XDECREF(exc);
5692 return NULL;
5693}
5694
5695PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005696_PyUnicode_EncodeUTF32(PyObject *str,
5697 const char *errors,
5698 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005699{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005700 enum PyUnicode_Kind kind;
5701 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005702 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005703 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005704 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005705#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005706 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005707#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005708 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005709#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005710 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005711 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005712 PyObject *errorHandler = NULL;
5713 PyObject *exc = NULL;
5714 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005715
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005716 if (!PyUnicode_Check(str)) {
5717 PyErr_BadArgument();
5718 return NULL;
5719 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005720 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005721 return NULL;
5722 kind = PyUnicode_KIND(str);
5723 data = PyUnicode_DATA(str);
5724 len = PyUnicode_GET_LENGTH(str);
5725
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005726 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005727 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005728 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005729 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005730 if (v == NULL)
5731 return NULL;
5732
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005733 /* output buffer is 4-bytes aligned */
5734 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005735 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005736 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005737 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005738 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005739 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005740
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005741 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005742 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005743 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005744 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005745 else
5746 encoding = "utf-32";
5747
5748 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005749 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5750 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005751 }
5752
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005753 pos = 0;
5754 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005755 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005756
5757 if (kind == PyUnicode_2BYTE_KIND) {
5758 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5759 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005760 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005761 else {
5762 assert(kind == PyUnicode_4BYTE_KIND);
5763 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5764 &out, native_ordering);
5765 }
5766 if (pos == len)
5767 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005768
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005769 rep = unicode_encode_call_errorhandler(
5770 errors, &errorHandler,
5771 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005772 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005773 if (!rep)
5774 goto error;
5775
5776 if (PyBytes_Check(rep)) {
5777 repsize = PyBytes_GET_SIZE(rep);
5778 if (repsize & 3) {
5779 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005780 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005781 "surrogates not allowed");
5782 goto error;
5783 }
5784 moreunits = repsize / 4;
5785 }
5786 else {
5787 assert(PyUnicode_Check(rep));
5788 if (PyUnicode_READY(rep) < 0)
5789 goto error;
5790 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5791 if (!PyUnicode_IS_ASCII(rep)) {
5792 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005793 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005794 "surrogates not allowed");
5795 goto error;
5796 }
5797 }
5798
5799 /* four bytes are reserved for each surrogate */
5800 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005801 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005802 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005803 /* integer overflow */
5804 PyErr_NoMemory();
5805 goto error;
5806 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005807 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005808 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005809 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005810 }
5811
5812 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005813 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005814 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005815 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005816 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005817 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5818 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005819 }
5820
5821 Py_CLEAR(rep);
5822 }
5823
5824 /* Cut back to size actually needed. This is necessary for, for example,
5825 encoding of a string containing isolated surrogates and the 'ignore'
5826 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005827 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005828 if (nsize != PyBytes_GET_SIZE(v))
5829 _PyBytes_Resize(&v, nsize);
5830 Py_XDECREF(errorHandler);
5831 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005832 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005833 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005834 error:
5835 Py_XDECREF(rep);
5836 Py_XDECREF(errorHandler);
5837 Py_XDECREF(exc);
5838 Py_XDECREF(v);
5839 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005840}
5841
Alexander Belopolsky40018472011-02-26 01:02:56 +00005842PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005843PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5844 Py_ssize_t size,
5845 const char *errors,
5846 int byteorder)
5847{
5848 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005849 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005850 if (tmp == NULL)
5851 return NULL;
5852 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5853 Py_DECREF(tmp);
5854 return result;
5855}
5856
5857PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005858PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005859{
Victor Stinnerb960b342011-11-20 19:12:52 +01005860 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005861}
5862
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863/* --- UTF-16 Codec ------------------------------------------------------- */
5864
Tim Peters772747b2001-08-09 22:21:55 +00005865PyObject *
5866PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 Py_ssize_t size,
5868 const char *errors,
5869 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870{
Walter Dörwald69652032004-09-07 20:24:22 +00005871 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5872}
5873
5874PyObject *
5875PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 Py_ssize_t size,
5877 const char *errors,
5878 int *byteorder,
5879 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005880{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005881 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005882 Py_ssize_t startinpos;
5883 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005884 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005885 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005886 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005887 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005888 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 PyObject *errorHandler = NULL;
5890 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005891 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892
Andy Lestere6be9b52020-02-11 20:28:35 -06005893 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005894 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895
5896 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005897 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005899 /* Check for BOM marks (U+FEFF) in the input and adjust current
5900 byte order setting accordingly. In native mode, the leading BOM
5901 mark is skipped, in all other modes, it is copied to the output
5902 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005903 if (bo == 0 && size >= 2) {
5904 const Py_UCS4 bom = (q[1] << 8) | q[0];
5905 if (bom == 0xFEFF) {
5906 q += 2;
5907 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005909 else if (bom == 0xFFFE) {
5910 q += 2;
5911 bo = 1;
5912 }
5913 if (byteorder)
5914 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916
Antoine Pitrou63065d72012-05-15 23:48:04 +02005917 if (q == e) {
5918 if (consumed)
5919 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005920 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005921 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005922
Christian Heimes743e0cd2012-10-17 23:52:17 +02005923#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005924 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005925 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005926#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005927 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005928 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005929#endif
Tim Peters772747b2001-08-09 22:21:55 +00005930
Antoine Pitrou63065d72012-05-15 23:48:04 +02005931 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005932 character count normally. Error handler will take care of
5933 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005934 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005935 writer.min_length = (e - q + 1) / 2;
5936 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005937 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005938
Antoine Pitrou63065d72012-05-15 23:48:04 +02005939 while (1) {
5940 Py_UCS4 ch = 0;
5941 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005942 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005943 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005944 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005945 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005946 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005947 native_ordering);
5948 else
5949 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005950 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005951 native_ordering);
5952 } else if (kind == PyUnicode_2BYTE_KIND) {
5953 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005954 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005955 native_ordering);
5956 } else {
5957 assert(kind == PyUnicode_4BYTE_KIND);
5958 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005959 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005960 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005961 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005962 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005963
Antoine Pitrou63065d72012-05-15 23:48:04 +02005964 switch (ch)
5965 {
5966 case 0:
5967 /* remaining byte at the end? (size should be even) */
5968 if (q == e || consumed)
5969 goto End;
5970 errmsg = "truncated data";
5971 startinpos = ((const char *)q) - starts;
5972 endinpos = ((const char *)e) - starts;
5973 break;
5974 /* The remaining input chars are ignored if the callback
5975 chooses to skip the input */
5976 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005977 q -= 2;
5978 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005979 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005980 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005981 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005982 endinpos = ((const char *)e) - starts;
5983 break;
5984 case 2:
5985 errmsg = "illegal encoding";
5986 startinpos = ((const char *)q) - 2 - starts;
5987 endinpos = startinpos + 2;
5988 break;
5989 case 3:
5990 errmsg = "illegal UTF-16 surrogate";
5991 startinpos = ((const char *)q) - 4 - starts;
5992 endinpos = startinpos + 2;
5993 break;
5994 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005995 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005996 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 continue;
5998 }
5999
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006000 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006001 errors,
6002 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006003 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006004 &starts,
6005 (const char **)&e,
6006 &startinpos,
6007 &endinpos,
6008 &exc,
6009 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006010 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 }
6013
Antoine Pitrou63065d72012-05-15 23:48:04 +02006014End:
Walter Dörwald69652032004-09-07 20:24:22 +00006015 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006017
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018 Py_XDECREF(errorHandler);
6019 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006020 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006023 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006024 Py_XDECREF(errorHandler);
6025 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 return NULL;
6027}
6028
Tim Peters772747b2001-08-09 22:21:55 +00006029PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006030_PyUnicode_EncodeUTF16(PyObject *str,
6031 const char *errors,
6032 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006034 enum PyUnicode_Kind kind;
6035 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006036 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006037 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006038 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006039 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006040#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006041 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006042#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006043 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006044#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006045 const char *encoding;
6046 Py_ssize_t nsize, pos;
6047 PyObject *errorHandler = NULL;
6048 PyObject *exc = NULL;
6049 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006050
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006051 if (!PyUnicode_Check(str)) {
6052 PyErr_BadArgument();
6053 return NULL;
6054 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006055 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006056 return NULL;
6057 kind = PyUnicode_KIND(str);
6058 data = PyUnicode_DATA(str);
6059 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006060
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006061 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006062 if (kind == PyUnicode_4BYTE_KIND) {
6063 const Py_UCS4 *in = (const Py_UCS4 *)data;
6064 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006065 while (in < end) {
6066 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006067 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006068 }
6069 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006070 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006071 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006073 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006074 nsize = len + pairs + (byteorder == 0);
6075 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006076 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006080 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006081 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006082 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006083 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006084 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006085 }
6086 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006087 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006088 }
Tim Peters772747b2001-08-09 22:21:55 +00006089
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006090 if (kind == PyUnicode_1BYTE_KIND) {
6091 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6092 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006093 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006094
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006095 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006096 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006097 }
6098 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006099 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006100 }
6101 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006102 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006103 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006104
6105 pos = 0;
6106 while (pos < len) {
6107 Py_ssize_t repsize, moreunits;
6108
6109 if (kind == PyUnicode_2BYTE_KIND) {
6110 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6111 &out, native_ordering);
6112 }
6113 else {
6114 assert(kind == PyUnicode_4BYTE_KIND);
6115 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6116 &out, native_ordering);
6117 }
6118 if (pos == len)
6119 break;
6120
6121 rep = unicode_encode_call_errorhandler(
6122 errors, &errorHandler,
6123 encoding, "surrogates not allowed",
6124 str, &exc, pos, pos + 1, &pos);
6125 if (!rep)
6126 goto error;
6127
6128 if (PyBytes_Check(rep)) {
6129 repsize = PyBytes_GET_SIZE(rep);
6130 if (repsize & 1) {
6131 raise_encode_exception(&exc, encoding,
6132 str, pos - 1, pos,
6133 "surrogates not allowed");
6134 goto error;
6135 }
6136 moreunits = repsize / 2;
6137 }
6138 else {
6139 assert(PyUnicode_Check(rep));
6140 if (PyUnicode_READY(rep) < 0)
6141 goto error;
6142 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6143 if (!PyUnicode_IS_ASCII(rep)) {
6144 raise_encode_exception(&exc, encoding,
6145 str, pos - 1, pos,
6146 "surrogates not allowed");
6147 goto error;
6148 }
6149 }
6150
6151 /* two bytes are reserved for each surrogate */
6152 if (moreunits > 1) {
6153 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006154 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006155 /* integer overflow */
6156 PyErr_NoMemory();
6157 goto error;
6158 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006159 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006160 goto error;
6161 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6162 }
6163
6164 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006165 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006166 out += moreunits;
6167 } else /* rep is unicode */ {
6168 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6169 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6170 &out, native_ordering);
6171 }
6172
6173 Py_CLEAR(rep);
6174 }
6175
6176 /* Cut back to size actually needed. This is necessary for, for example,
6177 encoding of a string containing isolated surrogates and the 'ignore' handler
6178 is used. */
6179 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6180 if (nsize != PyBytes_GET_SIZE(v))
6181 _PyBytes_Resize(&v, nsize);
6182 Py_XDECREF(errorHandler);
6183 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006184 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006185 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006186 error:
6187 Py_XDECREF(rep);
6188 Py_XDECREF(errorHandler);
6189 Py_XDECREF(exc);
6190 Py_XDECREF(v);
6191 return NULL;
6192#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193}
6194
Alexander Belopolsky40018472011-02-26 01:02:56 +00006195PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006196PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6197 Py_ssize_t size,
6198 const char *errors,
6199 int byteorder)
6200{
6201 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006202 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006203 if (tmp == NULL)
6204 return NULL;
6205 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6206 Py_DECREF(tmp);
6207 return result;
6208}
6209
6210PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006211PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006213 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214}
6215
6216/* --- Unicode Escape Codec ----------------------------------------------- */
6217
Fredrik Lundh06d12682001-01-24 07:59:11 +00006218static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006219
Alexander Belopolsky40018472011-02-26 01:02:56 +00006220PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006221_PyUnicode_DecodeUnicodeEscape(const char *s,
6222 Py_ssize_t size,
6223 const char *errors,
6224 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006226 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006227 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006229 PyObject *errorHandler = NULL;
6230 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006231
Eric V. Smith42454af2016-10-31 09:22:08 -04006232 // so we can remember if we've seen an invalid escape char or not
6233 *first_invalid_escape = NULL;
6234
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006236 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006237 }
6238 /* Escaped strings will always be longer than the resulting
6239 Unicode string, so we start with size here and then reduce the
6240 length after conversion to the true value.
6241 (but if the error callback returns a long replacement string
6242 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006243 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006244 writer.min_length = size;
6245 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6246 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006247 }
6248
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 end = s + size;
6250 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 unsigned char c = (unsigned char) *s++;
6252 Py_UCS4 ch;
6253 int count;
6254 Py_ssize_t startinpos;
6255 Py_ssize_t endinpos;
6256 const char *message;
6257
6258#define WRITE_ASCII_CHAR(ch) \
6259 do { \
6260 assert(ch <= 127); \
6261 assert(writer.pos < writer.size); \
6262 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6263 } while(0)
6264
6265#define WRITE_CHAR(ch) \
6266 do { \
6267 if (ch <= writer.maxchar) { \
6268 assert(writer.pos < writer.size); \
6269 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6270 } \
6271 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6272 goto onError; \
6273 } \
6274 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275
6276 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006277 if (c != '\\') {
6278 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 continue;
6280 }
6281
Victor Stinner62ec3312016-09-06 17:04:34 -07006282 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006284 if (s >= end) {
6285 message = "\\ at end of string";
6286 goto error;
6287 }
6288 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006289
Victor Stinner62ec3312016-09-06 17:04:34 -07006290 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006291 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006294 case '\n': continue;
6295 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6296 case '\'': WRITE_ASCII_CHAR('\''); continue;
6297 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6298 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006299 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006300 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6301 case 't': WRITE_ASCII_CHAR('\t'); continue;
6302 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6303 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006304 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006306 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006307 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310 case '0': case '1': case '2': case '3':
6311 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006312 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006313 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 ch = (ch<<3) + *s++ - '0';
6315 if (s < end && '0' <= *s && *s <= '7') {
6316 ch = (ch<<3) + *s++ - '0';
6317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 WRITE_CHAR(ch);
6320 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 /* hex escapes */
6323 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006325 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006326 message = "truncated \\xXX escape";
6327 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006331 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006332 message = "truncated \\uXXXX escape";
6333 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006336 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006337 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006338 message = "truncated \\UXXXXXXXX escape";
6339 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006341 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006342 ch <<= 4;
6343 if (c >= '0' && c <= '9') {
6344 ch += c - '0';
6345 }
6346 else if (c >= 'a' && c <= 'f') {
6347 ch += c - ('a' - 10);
6348 }
6349 else if (c >= 'A' && c <= 'F') {
6350 ch += c - ('A' - 10);
6351 }
6352 else {
6353 break;
6354 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006355 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006356 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006357 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006358 }
6359
6360 /* when we get here, ch is a 32-bit unicode character */
6361 if (ch > MAX_UNICODE) {
6362 message = "illegal Unicode character";
6363 goto error;
6364 }
6365
6366 WRITE_CHAR(ch);
6367 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006368
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006370 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006371 if (ucnhash_CAPI == NULL) {
6372 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006373 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6374 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006375 if (ucnhash_CAPI == NULL) {
6376 PyErr_SetString(
6377 PyExc_UnicodeError,
6378 "\\N escapes not supported (can't load unicodedata module)"
6379 );
6380 goto onError;
6381 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006382 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006383
6384 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006385 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006386 const char *start = ++s;
6387 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006388 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006389 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006390 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006391 namelen = s - start;
6392 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006393 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006394 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006395 ch = 0xffffffff; /* in case 'getcode' messes up */
6396 if (namelen <= INT_MAX &&
6397 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6398 &ch, 0)) {
6399 assert(ch <= MAX_UNICODE);
6400 WRITE_CHAR(ch);
6401 continue;
6402 }
6403 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006404 }
6405 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006406 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006407
6408 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006409 if (*first_invalid_escape == NULL) {
6410 *first_invalid_escape = s-1; /* Back up one char, since we've
6411 already incremented s. */
6412 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006413 WRITE_ASCII_CHAR('\\');
6414 WRITE_CHAR(c);
6415 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006417
6418 error:
6419 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006420 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006421 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006422 errors, &errorHandler,
6423 "unicodeescape", message,
6424 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006426 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006428 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006429
6430#undef WRITE_ASCII_CHAR
6431#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006433
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006434 Py_XDECREF(errorHandler);
6435 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006436 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006437
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006439 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006440 Py_XDECREF(errorHandler);
6441 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 return NULL;
6443}
6444
Eric V. Smith42454af2016-10-31 09:22:08 -04006445PyObject *
6446PyUnicode_DecodeUnicodeEscape(const char *s,
6447 Py_ssize_t size,
6448 const char *errors)
6449{
6450 const char *first_invalid_escape;
6451 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6452 &first_invalid_escape);
6453 if (result == NULL)
6454 return NULL;
6455 if (first_invalid_escape != NULL) {
6456 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6457 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006458 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006459 Py_DECREF(result);
6460 return NULL;
6461 }
6462 }
6463 return result;
6464}
6465
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006466/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467
Alexander Belopolsky40018472011-02-26 01:02:56 +00006468PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006469PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006471 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006472 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006474 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006475 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006476 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477
Ezio Melottie7f90372012-10-05 03:33:31 +03006478 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006479 escape.
6480
Ezio Melottie7f90372012-10-05 03:33:31 +03006481 For UCS1 strings it's '\xxx', 4 bytes per source character.
6482 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6483 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006484 */
6485
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006486 if (!PyUnicode_Check(unicode)) {
6487 PyErr_BadArgument();
6488 return NULL;
6489 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006491 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006492 }
Victor Stinner358af132015-10-12 22:36:57 +02006493
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006494 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006495 if (len == 0) {
6496 return PyBytes_FromStringAndSize(NULL, 0);
6497 }
6498
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006499 kind = PyUnicode_KIND(unicode);
6500 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006501 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6502 bytes, and 1 byte characters 4. */
6503 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006504 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006505 return PyErr_NoMemory();
6506 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006507 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006508 if (repr == NULL) {
6509 return NULL;
6510 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006511
Victor Stinner62ec3312016-09-06 17:04:34 -07006512 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006513 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006514 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006515
Victor Stinner62ec3312016-09-06 17:04:34 -07006516 /* U+0000-U+00ff range */
6517 if (ch < 0x100) {
6518 if (ch >= ' ' && ch < 127) {
6519 if (ch != '\\') {
6520 /* Copy printable US ASCII as-is */
6521 *p++ = (char) ch;
6522 }
6523 /* Escape backslashes */
6524 else {
6525 *p++ = '\\';
6526 *p++ = '\\';
6527 }
6528 }
Victor Stinner358af132015-10-12 22:36:57 +02006529
Victor Stinner62ec3312016-09-06 17:04:34 -07006530 /* Map special whitespace to '\t', \n', '\r' */
6531 else if (ch == '\t') {
6532 *p++ = '\\';
6533 *p++ = 't';
6534 }
6535 else if (ch == '\n') {
6536 *p++ = '\\';
6537 *p++ = 'n';
6538 }
6539 else if (ch == '\r') {
6540 *p++ = '\\';
6541 *p++ = 'r';
6542 }
6543
6544 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6545 else {
6546 *p++ = '\\';
6547 *p++ = 'x';
6548 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6549 *p++ = Py_hexdigits[ch & 0x000F];
6550 }
Tim Petersced69f82003-09-16 20:30:58 +00006551 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006552 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006553 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 *p++ = '\\';
6555 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006556 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6557 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6558 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6559 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006561 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6562 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006563
Victor Stinner62ec3312016-09-06 17:04:34 -07006564 /* Make sure that the first two digits are zero */
6565 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006566 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006567 *p++ = 'U';
6568 *p++ = '0';
6569 *p++ = '0';
6570 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6571 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6572 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6573 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6574 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6575 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578
Victor Stinner62ec3312016-09-06 17:04:34 -07006579 assert(p - PyBytes_AS_STRING(repr) > 0);
6580 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6581 return NULL;
6582 }
6583 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584}
6585
Alexander Belopolsky40018472011-02-26 01:02:56 +00006586PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006587PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6588 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006590 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006591 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006592 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006594 }
6595
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006596 result = PyUnicode_AsUnicodeEscapeString(tmp);
6597 Py_DECREF(tmp);
6598 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599}
6600
6601/* --- Raw Unicode Escape Codec ------------------------------------------- */
6602
Alexander Belopolsky40018472011-02-26 01:02:56 +00006603PyObject *
6604PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006605 Py_ssize_t size,
6606 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006609 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006611 PyObject *errorHandler = NULL;
6612 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006613
Victor Stinner62ec3312016-09-06 17:04:34 -07006614 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006615 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006616 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006617
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 /* Escaped strings will always be longer than the resulting
6619 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620 length after conversion to the true value. (But decoding error
6621 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006622 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006623 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006624 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6625 goto onError;
6626 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006627
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 end = s + size;
6629 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006630 unsigned char c = (unsigned char) *s++;
6631 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006632 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006633 Py_ssize_t startinpos;
6634 Py_ssize_t endinpos;
6635 const char *message;
6636
6637#define WRITE_CHAR(ch) \
6638 do { \
6639 if (ch <= writer.maxchar) { \
6640 assert(writer.pos < writer.size); \
6641 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6642 } \
6643 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6644 goto onError; \
6645 } \
6646 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647
Benjamin Peterson29060642009-01-31 22:14:21 +00006648 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006649 if (c != '\\' || s >= end) {
6650 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006652 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006653
Victor Stinner62ec3312016-09-06 17:04:34 -07006654 c = (unsigned char) *s++;
6655 if (c == 'u') {
6656 count = 4;
6657 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006659 else if (c == 'U') {
6660 count = 8;
6661 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006662 }
6663 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006664 assert(writer.pos < writer.size);
6665 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6666 WRITE_CHAR(c);
6667 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006668 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006669 startinpos = s - starts - 2;
6670
6671 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6672 for (ch = 0; count && s < end; ++s, --count) {
6673 c = (unsigned char)*s;
6674 ch <<= 4;
6675 if (c >= '0' && c <= '9') {
6676 ch += c - '0';
6677 }
6678 else if (c >= 'a' && c <= 'f') {
6679 ch += c - ('a' - 10);
6680 }
6681 else if (c >= 'A' && c <= 'F') {
6682 ch += c - ('A' - 10);
6683 }
6684 else {
6685 break;
6686 }
6687 }
6688 if (!count) {
6689 if (ch <= MAX_UNICODE) {
6690 WRITE_CHAR(ch);
6691 continue;
6692 }
6693 message = "\\Uxxxxxxxx out of range";
6694 }
6695
6696 endinpos = s-starts;
6697 writer.min_length = end - s + writer.pos;
6698 if (unicode_decode_call_errorhandler_writer(
6699 errors, &errorHandler,
6700 "rawunicodeescape", message,
6701 &starts, &end, &startinpos, &endinpos, &exc, &s,
6702 &writer)) {
6703 goto onError;
6704 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006705 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006706
6707#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 Py_XDECREF(errorHandler);
6710 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006711 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006712
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006714 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006715 Py_XDECREF(errorHandler);
6716 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006718
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719}
6720
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006721
Alexander Belopolsky40018472011-02-26 01:02:56 +00006722PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006723PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724{
Victor Stinner62ec3312016-09-06 17:04:34 -07006725 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006727 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006728 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006729 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006730 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006732 if (!PyUnicode_Check(unicode)) {
6733 PyErr_BadArgument();
6734 return NULL;
6735 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006736 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006737 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006738 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006739 kind = PyUnicode_KIND(unicode);
6740 data = PyUnicode_DATA(unicode);
6741 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006742 if (kind == PyUnicode_1BYTE_KIND) {
6743 return PyBytes_FromStringAndSize(data, len);
6744 }
Victor Stinner0e368262011-11-10 20:12:49 +01006745
Victor Stinner62ec3312016-09-06 17:04:34 -07006746 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6747 bytes, and 1 byte characters 4. */
6748 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006749
Victor Stinner62ec3312016-09-06 17:04:34 -07006750 if (len > PY_SSIZE_T_MAX / expandsize) {
6751 return PyErr_NoMemory();
6752 }
6753 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6754 if (repr == NULL) {
6755 return NULL;
6756 }
6757 if (len == 0) {
6758 return repr;
6759 }
6760
6761 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006762 for (pos = 0; pos < len; pos++) {
6763 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006764
Victor Stinner62ec3312016-09-06 17:04:34 -07006765 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6766 if (ch < 0x100) {
6767 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006768 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006769 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006770 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 *p++ = '\\';
6772 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006773 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6774 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6775 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6776 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006778 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6779 else {
6780 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6781 *p++ = '\\';
6782 *p++ = 'U';
6783 *p++ = '0';
6784 *p++ = '0';
6785 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6786 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6787 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6788 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6789 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6790 *p++ = Py_hexdigits[ch & 15];
6791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006793
Victor Stinner62ec3312016-09-06 17:04:34 -07006794 assert(p > PyBytes_AS_STRING(repr));
6795 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6796 return NULL;
6797 }
6798 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799}
6800
Alexander Belopolsky40018472011-02-26 01:02:56 +00006801PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006802PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6803 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006805 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006806 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006807 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006808 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006809 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6810 Py_DECREF(tmp);
6811 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812}
6813
6814/* --- Latin-1 Codec ------------------------------------------------------ */
6815
Alexander Belopolsky40018472011-02-26 01:02:56 +00006816PyObject *
6817PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006818 Py_ssize_t size,
6819 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006822 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823}
6824
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006825/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006826static void
6827make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006828 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006829 PyObject *unicode,
6830 Py_ssize_t startpos, Py_ssize_t endpos,
6831 const char *reason)
6832{
6833 if (*exceptionObject == NULL) {
6834 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006835 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006836 encoding, unicode, startpos, endpos, reason);
6837 }
6838 else {
6839 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6840 goto onError;
6841 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6842 goto onError;
6843 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6844 goto onError;
6845 return;
6846 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006847 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006848 }
6849}
6850
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006851/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006852static void
6853raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006854 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006855 PyObject *unicode,
6856 Py_ssize_t startpos, Py_ssize_t endpos,
6857 const char *reason)
6858{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006859 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006860 encoding, unicode, startpos, endpos, reason);
6861 if (*exceptionObject != NULL)
6862 PyCodec_StrictErrors(*exceptionObject);
6863}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006864
6865/* error handling callback helper:
6866 build arguments, call the callback and check the arguments,
6867 put the result into newpos and return the replacement string, which
6868 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006869static PyObject *
6870unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006871 PyObject **errorHandler,
6872 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006873 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006874 Py_ssize_t startpos, Py_ssize_t endpos,
6875 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006876{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006877 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006878 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006879 PyObject *restuple;
6880 PyObject *resunicode;
6881
6882 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006884 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006886 }
6887
Benjamin Petersonbac79492012-01-14 13:34:47 -05006888 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006889 return NULL;
6890 len = PyUnicode_GET_LENGTH(unicode);
6891
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006892 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006893 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006894 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896
Petr Viktorinffd97532020-02-11 17:46:57 +01006897 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006900 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006901 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 Py_DECREF(restuple);
6903 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006904 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006905 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 &resunicode, newpos)) {
6907 Py_DECREF(restuple);
6908 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006909 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006910 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6911 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6912 Py_DECREF(restuple);
6913 return NULL;
6914 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006915 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006916 *newpos = len + *newpos;
6917 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006918 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 Py_DECREF(restuple);
6920 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006921 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006922 Py_INCREF(resunicode);
6923 Py_DECREF(restuple);
6924 return resunicode;
6925}
6926
Alexander Belopolsky40018472011-02-26 01:02:56 +00006927static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006928unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006929 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006930 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006931{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006932 /* input state */
6933 Py_ssize_t pos=0, size;
6934 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006935 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006936 /* pointer into the output */
6937 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006938 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6939 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006940 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006941 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006942 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006943 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006944 /* output object */
6945 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946
Benjamin Petersonbac79492012-01-14 13:34:47 -05006947 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006948 return NULL;
6949 size = PyUnicode_GET_LENGTH(unicode);
6950 kind = PyUnicode_KIND(unicode);
6951 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006952 /* allocate enough for a simple encoding without
6953 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006954 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006955 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006956
6957 _PyBytesWriter_Init(&writer);
6958 str = _PyBytesWriter_Alloc(&writer, size);
6959 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006960 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006961
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006962 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006963 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006964
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006966 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006968 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006969 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006970 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006972 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006974 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006975 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006976 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006977
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006978 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006980
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006981 /* Only overallocate the buffer if it's not the last write */
6982 writer.overallocate = (collend < size);
6983
Benjamin Peterson29060642009-01-31 22:14:21 +00006984 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006985 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006986 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006987
6988 switch (error_handler) {
6989 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006990 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006992
6993 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006994 memset(str, '?', collend - collstart);
6995 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006996 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006997 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006998 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 break;
Victor Stinner50149202015-09-22 00:26:54 +02007000
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007001 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007002 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007003 writer.min_size -= (collend - collstart);
7004 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007005 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007006 if (str == NULL)
7007 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007008 pos = collend;
7009 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007010
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007011 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007012 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007013 writer.min_size -= (collend - collstart);
7014 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007015 unicode, collstart, collend);
7016 if (str == NULL)
7017 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007018 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 break;
Victor Stinner50149202015-09-22 00:26:54 +02007020
Victor Stinnerc3713e92015-09-29 12:32:13 +02007021 case _Py_ERROR_SURROGATEESCAPE:
7022 for (i = collstart; i < collend; ++i) {
7023 ch = PyUnicode_READ(kind, data, i);
7024 if (ch < 0xdc80 || 0xdcff < ch) {
7025 /* Not a UTF-8b surrogate */
7026 break;
7027 }
7028 *str++ = (char)(ch - 0xdc00);
7029 ++pos;
7030 }
7031 if (i >= collend)
7032 break;
7033 collstart = pos;
7034 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007035 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007036
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007038 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7039 encoding, reason, unicode, &exc,
7040 collstart, collend, &newpos);
7041 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007043
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007044 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007045 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007046
Victor Stinner6bd525b2015-10-09 13:10:05 +02007047 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007048 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007049 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007050 PyBytes_AS_STRING(rep),
7051 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007052 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007053 else {
7054 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007055
Victor Stinner6bd525b2015-10-09 13:10:05 +02007056 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007058
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007059 if (limit == 256 ?
7060 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7061 !PyUnicode_IS_ASCII(rep))
7062 {
7063 /* Not all characters are smaller than limit */
7064 raise_encode_exception(&exc, encoding, unicode,
7065 collstart, collend, reason);
7066 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007067 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007068 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7069 str = _PyBytesWriter_WriteBytes(&writer, str,
7070 PyUnicode_DATA(rep),
7071 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007073 if (str == NULL)
7074 goto onError;
7075
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007076 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007077 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007078 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007079
7080 /* If overallocation was disabled, ensure that it was the last
7081 write. Otherwise, we missed an optimization */
7082 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007083 }
7084 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007085
Victor Stinner50149202015-09-22 00:26:54 +02007086 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007087 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007088 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007089
7090 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007091 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007092 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007093 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007094 Py_XDECREF(exc);
7095 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007096}
7097
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007098/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007099PyObject *
7100PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007101 Py_ssize_t size,
7102 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007104 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007105 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007106 if (unicode == NULL)
7107 return NULL;
7108 result = unicode_encode_ucs1(unicode, errors, 256);
7109 Py_DECREF(unicode);
7110 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111}
7112
Alexander Belopolsky40018472011-02-26 01:02:56 +00007113PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007114_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115{
7116 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007117 PyErr_BadArgument();
7118 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007120 if (PyUnicode_READY(unicode) == -1)
7121 return NULL;
7122 /* Fast path: if it is a one-byte string, construct
7123 bytes object directly. */
7124 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7125 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7126 PyUnicode_GET_LENGTH(unicode));
7127 /* Non-Latin-1 characters present. Defer to above function to
7128 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007129 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007130}
7131
7132PyObject*
7133PyUnicode_AsLatin1String(PyObject *unicode)
7134{
7135 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136}
7137
7138/* --- 7-bit ASCII Codec -------------------------------------------------- */
7139
Alexander Belopolsky40018472011-02-26 01:02:56 +00007140PyObject *
7141PyUnicode_DecodeASCII(const char *s,
7142 Py_ssize_t size,
7143 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007145 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007146 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007147 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007148 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007149 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007150
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007152 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007153
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007155 if (size == 1 && (unsigned char)s[0] < 128)
7156 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007157
Inada Naoki770847a2019-06-24 12:30:24 +09007158 // Shortcut for simple case
7159 PyObject *u = PyUnicode_New(size, 127);
7160 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007161 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007162 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007163 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007164 if (outpos == size) {
7165 return u;
7166 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007167
Inada Naoki770847a2019-06-24 12:30:24 +09007168 _PyUnicodeWriter writer;
7169 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007170 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007171
Inada Naoki770847a2019-06-24 12:30:24 +09007172 s += outpos;
7173 int kind = writer.kind;
7174 void *data = writer.data;
7175 Py_ssize_t startinpos, endinpos;
7176
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007177 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007178 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007179 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007180 PyUnicode_WRITE(kind, data, writer.pos, c);
7181 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007182 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007183 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007185
7186 /* byte outsize range 0x00..0x7f: call the error handler */
7187
7188 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007189 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007190
7191 switch (error_handler)
7192 {
7193 case _Py_ERROR_REPLACE:
7194 case _Py_ERROR_SURROGATEESCAPE:
7195 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007196 but we may switch to UCS2 at the first write */
7197 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7198 goto onError;
7199 kind = writer.kind;
7200 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007201
7202 if (error_handler == _Py_ERROR_REPLACE)
7203 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7204 else
7205 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7206 writer.pos++;
7207 ++s;
7208 break;
7209
7210 case _Py_ERROR_IGNORE:
7211 ++s;
7212 break;
7213
7214 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007215 startinpos = s-starts;
7216 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007217 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007218 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 "ascii", "ordinal not in range(128)",
7220 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007221 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007222 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007223 kind = writer.kind;
7224 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007227 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007228 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007229 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007230
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007232 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007233 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007234 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 return NULL;
7236}
7237
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007238/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007239PyObject *
7240PyUnicode_EncodeASCII(const Py_UNICODE *p,
7241 Py_ssize_t size,
7242 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007244 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007245 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007246 if (unicode == NULL)
7247 return NULL;
7248 result = unicode_encode_ucs1(unicode, errors, 128);
7249 Py_DECREF(unicode);
7250 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251}
7252
Alexander Belopolsky40018472011-02-26 01:02:56 +00007253PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007254_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255{
7256 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 PyErr_BadArgument();
7258 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007260 if (PyUnicode_READY(unicode) == -1)
7261 return NULL;
7262 /* Fast path: if it is an ASCII-only string, construct bytes object
7263 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007264 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007265 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7266 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007267 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007268}
7269
7270PyObject *
7271PyUnicode_AsASCIIString(PyObject *unicode)
7272{
7273 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274}
7275
Steve Dowercc16be82016-09-08 10:35:16 -07007276#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007277
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007278/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007279
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007280#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007281#define NEED_RETRY
7282#endif
7283
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7288#define DECODING_CHUNK_SIZE (INT_MAX/4)
7289
Victor Stinner3a50e702011-10-18 21:21:00 +02007290#ifndef WC_ERR_INVALID_CHARS
7291# define WC_ERR_INVALID_CHARS 0x0080
7292#endif
7293
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007294static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007295code_page_name(UINT code_page, PyObject **obj)
7296{
7297 *obj = NULL;
7298 if (code_page == CP_ACP)
7299 return "mbcs";
7300 if (code_page == CP_UTF7)
7301 return "CP_UTF7";
7302 if (code_page == CP_UTF8)
7303 return "CP_UTF8";
7304
7305 *obj = PyBytes_FromFormat("cp%u", code_page);
7306 if (*obj == NULL)
7307 return NULL;
7308 return PyBytes_AS_STRING(*obj);
7309}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007310
Victor Stinner3a50e702011-10-18 21:21:00 +02007311static DWORD
7312decode_code_page_flags(UINT code_page)
7313{
7314 if (code_page == CP_UTF7) {
7315 /* The CP_UTF7 decoder only supports flags=0 */
7316 return 0;
7317 }
7318 else
7319 return MB_ERR_INVALID_CHARS;
7320}
7321
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007322/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007323 * Decode a byte string from a Windows code page into unicode object in strict
7324 * mode.
7325 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007326 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7327 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007328 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007329static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007330decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007331 wchar_t **buf,
7332 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 const char *in,
7334 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007335{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007336 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007337 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007339
7340 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007341 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007342 while ((outsize = MultiByteToWideChar(code_page, flags,
7343 in, insize, NULL, 0)) <= 0)
7344 {
7345 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7346 goto error;
7347 }
7348 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7349 flags = 0;
7350 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007351
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007352 /* Extend a wchar_t* buffer */
7353 Py_ssize_t n = *bufsize; /* Get the current length */
7354 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7355 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007356 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007357 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007358
7359 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007360 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7361 if (outsize <= 0)
7362 goto error;
7363 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007364
Victor Stinner3a50e702011-10-18 21:21:00 +02007365error:
7366 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7367 return -2;
7368 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007369 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007370}
7371
Victor Stinner3a50e702011-10-18 21:21:00 +02007372/*
7373 * Decode a byte string from a code page into unicode object with an error
7374 * handler.
7375 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007376 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007377 * UnicodeDecodeError exception and returns -1 on error.
7378 */
7379static int
7380decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007381 wchar_t **buf,
7382 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007383 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007384 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007385{
7386 const char *startin = in;
7387 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007388 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 /* Ideally, we should get reason from FormatMessage. This is the Windows
7390 2000 English version of the message. */
7391 const char *reason = "No mapping for the Unicode character exists "
7392 "in the target code page.";
7393 /* each step cannot decode more than 1 character, but a character can be
7394 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007395 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007396 int insize;
7397 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 PyObject *errorHandler = NULL;
7399 PyObject *exc = NULL;
7400 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007401 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007402 DWORD err;
7403 int ret = -1;
7404
7405 assert(size > 0);
7406
7407 encoding = code_page_name(code_page, &encoding_obj);
7408 if (encoding == NULL)
7409 return -1;
7410
Victor Stinner7d00cc12014-03-17 23:08:06 +01007411 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007412 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7413 UnicodeDecodeError. */
7414 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7415 if (exc != NULL) {
7416 PyCodec_StrictErrors(exc);
7417 Py_CLEAR(exc);
7418 }
7419 goto error;
7420 }
7421
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007422 /* Extend a wchar_t* buffer */
7423 Py_ssize_t n = *bufsize; /* Get the current length */
7424 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7425 PyErr_NoMemory();
7426 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007428 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7429 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007430 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007431 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007432
7433 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 while (in < endin)
7435 {
7436 /* Decode a character */
7437 insize = 1;
7438 do
7439 {
7440 outsize = MultiByteToWideChar(code_page, flags,
7441 in, insize,
7442 buffer, Py_ARRAY_LENGTH(buffer));
7443 if (outsize > 0)
7444 break;
7445 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007446 if (err == ERROR_INVALID_FLAGS && flags) {
7447 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7448 flags = 0;
7449 continue;
7450 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 if (err != ERROR_NO_UNICODE_TRANSLATION
7452 && err != ERROR_INSUFFICIENT_BUFFER)
7453 {
7454 PyErr_SetFromWindowsErr(0);
7455 goto error;
7456 }
7457 insize++;
7458 }
7459 /* 4=maximum length of a UTF-8 sequence */
7460 while (insize <= 4 && (in + insize) <= endin);
7461
7462 if (outsize <= 0) {
7463 Py_ssize_t startinpos, endinpos, outpos;
7464
Victor Stinner7d00cc12014-03-17 23:08:06 +01007465 /* last character in partial decode? */
7466 if (in + insize >= endin && !final)
7467 break;
7468
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 startinpos = in - startin;
7470 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007471 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007472 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 errors, &errorHandler,
7474 encoding, reason,
7475 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007476 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007477 {
7478 goto error;
7479 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007480 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 }
7482 else {
7483 in += insize;
7484 memcpy(out, buffer, outsize * sizeof(wchar_t));
7485 out += outsize;
7486 }
7487 }
7488
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007489 /* Shrink the buffer */
7490 assert(out - *buf <= *bufsize);
7491 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007492 /* (in - startin) <= size and size is an int */
7493 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007494
7495error:
7496 Py_XDECREF(encoding_obj);
7497 Py_XDECREF(errorHandler);
7498 Py_XDECREF(exc);
7499 return ret;
7500}
7501
Victor Stinner3a50e702011-10-18 21:21:00 +02007502static PyObject *
7503decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007504 const char *s, Py_ssize_t size,
7505 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007506{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007507 wchar_t *buf = NULL;
7508 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007509 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007510
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 if (code_page < 0) {
7512 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7513 return NULL;
7514 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007515 if (size < 0) {
7516 PyErr_BadInternalCall();
7517 return NULL;
7518 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007519
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007520 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007522
Victor Stinner76a31a62011-11-04 00:05:13 +01007523 do
7524 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007525#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007526 if (size > DECODING_CHUNK_SIZE) {
7527 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007528 final = 0;
7529 done = 0;
7530 }
7531 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007532#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007533 {
7534 chunk_size = (int)size;
7535 final = (consumed == NULL);
7536 done = 1;
7537 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007538
Victor Stinner76a31a62011-11-04 00:05:13 +01007539 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007540 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007541 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007542 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007543 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007544
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007545 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007546 s, chunk_size);
7547 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007548 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007549 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007550 errors, final);
7551 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007552
7553 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007554 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007555 return NULL;
7556 }
7557
7558 if (consumed)
7559 *consumed += converted;
7560
7561 s += converted;
7562 size -= converted;
7563 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007564
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007565 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7566 PyMem_Free(buf);
7567 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007568}
7569
Alexander Belopolsky40018472011-02-26 01:02:56 +00007570PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007571PyUnicode_DecodeCodePageStateful(int code_page,
7572 const char *s,
7573 Py_ssize_t size,
7574 const char *errors,
7575 Py_ssize_t *consumed)
7576{
7577 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7578}
7579
7580PyObject *
7581PyUnicode_DecodeMBCSStateful(const char *s,
7582 Py_ssize_t size,
7583 const char *errors,
7584 Py_ssize_t *consumed)
7585{
7586 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7587}
7588
7589PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007590PyUnicode_DecodeMBCS(const char *s,
7591 Py_ssize_t size,
7592 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007593{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007594 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7595}
7596
Victor Stinner3a50e702011-10-18 21:21:00 +02007597static DWORD
7598encode_code_page_flags(UINT code_page, const char *errors)
7599{
7600 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007601 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007602 }
7603 else if (code_page == CP_UTF7) {
7604 /* CP_UTF7 only supports flags=0 */
7605 return 0;
7606 }
7607 else {
7608 if (errors != NULL && strcmp(errors, "replace") == 0)
7609 return 0;
7610 else
7611 return WC_NO_BEST_FIT_CHARS;
7612 }
7613}
7614
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007615/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007616 * Encode a Unicode string to a Windows code page into a byte string in strict
7617 * mode.
7618 *
7619 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007620 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007621 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007622static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007623encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007624 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007625 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007626{
Victor Stinner554f3f02010-06-16 23:33:54 +00007627 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 BOOL *pusedDefaultChar = &usedDefaultChar;
7629 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007630 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007631 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007632 const DWORD flags = encode_code_page_flags(code_page, NULL);
7633 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007634 /* Create a substring so that we can get the UTF-16 representation
7635 of just the slice under consideration. */
7636 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007637
Martin v. Löwis3d325192011-11-04 18:23:06 +01007638 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007639
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007641 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007642 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007643 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007644
Victor Stinner2fc507f2011-11-04 20:06:39 +01007645 substring = PyUnicode_Substring(unicode, offset, offset+len);
7646 if (substring == NULL)
7647 return -1;
7648 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7649 if (p == NULL) {
7650 Py_DECREF(substring);
7651 return -1;
7652 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007653 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007654
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007655 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007656 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007657 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007658 NULL, 0,
7659 NULL, pusedDefaultChar);
7660 if (outsize <= 0)
7661 goto error;
7662 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007663 if (pusedDefaultChar && *pusedDefaultChar) {
7664 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007665 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007666 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007667
Victor Stinner3a50e702011-10-18 21:21:00 +02007668 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007669 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007670 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007671 if (*outbytes == NULL) {
7672 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007674 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007675 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007676 }
7677 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007679 const Py_ssize_t n = PyBytes_Size(*outbytes);
7680 if (outsize > PY_SSIZE_T_MAX - n) {
7681 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007682 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007683 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007684 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007685 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7686 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007687 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007688 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007689 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007690 }
7691
7692 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007693 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007694 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007695 out, outsize,
7696 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007697 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007698 if (outsize <= 0)
7699 goto error;
7700 if (pusedDefaultChar && *pusedDefaultChar)
7701 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007702 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007703
Victor Stinner3a50e702011-10-18 21:21:00 +02007704error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007705 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007706 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7707 return -2;
7708 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007709 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007710}
7711
Victor Stinner3a50e702011-10-18 21:21:00 +02007712/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007713 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007714 * error handler.
7715 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007716 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007717 * -1 on other error.
7718 */
7719static int
7720encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007721 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007722 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007723{
Victor Stinner3a50e702011-10-18 21:21:00 +02007724 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007725 Py_ssize_t pos = unicode_offset;
7726 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007727 /* Ideally, we should get reason from FormatMessage. This is the Windows
7728 2000 English version of the message. */
7729 const char *reason = "invalid character";
7730 /* 4=maximum length of a UTF-8 sequence */
7731 char buffer[4];
7732 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7733 Py_ssize_t outsize;
7734 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007735 PyObject *errorHandler = NULL;
7736 PyObject *exc = NULL;
7737 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007738 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007739 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007740 PyObject *rep;
7741 int ret = -1;
7742
7743 assert(insize > 0);
7744
7745 encoding = code_page_name(code_page, &encoding_obj);
7746 if (encoding == NULL)
7747 return -1;
7748
7749 if (errors == NULL || strcmp(errors, "strict") == 0) {
7750 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7751 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007752 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007753 if (exc != NULL) {
7754 PyCodec_StrictErrors(exc);
7755 Py_DECREF(exc);
7756 }
7757 Py_XDECREF(encoding_obj);
7758 return -1;
7759 }
7760
7761 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7762 pusedDefaultChar = &usedDefaultChar;
7763 else
7764 pusedDefaultChar = NULL;
7765
7766 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7767 PyErr_NoMemory();
7768 goto error;
7769 }
7770 outsize = insize * Py_ARRAY_LENGTH(buffer);
7771
7772 if (*outbytes == NULL) {
7773 /* Create string object */
7774 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7775 if (*outbytes == NULL)
7776 goto error;
7777 out = PyBytes_AS_STRING(*outbytes);
7778 }
7779 else {
7780 /* Extend string object */
7781 Py_ssize_t n = PyBytes_Size(*outbytes);
7782 if (n > PY_SSIZE_T_MAX - outsize) {
7783 PyErr_NoMemory();
7784 goto error;
7785 }
7786 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7787 goto error;
7788 out = PyBytes_AS_STRING(*outbytes) + n;
7789 }
7790
7791 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007792 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007793 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007794 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7795 wchar_t chars[2];
7796 int charsize;
7797 if (ch < 0x10000) {
7798 chars[0] = (wchar_t)ch;
7799 charsize = 1;
7800 }
7801 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007802 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7803 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007804 charsize = 2;
7805 }
7806
Victor Stinner3a50e702011-10-18 21:21:00 +02007807 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007808 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007809 buffer, Py_ARRAY_LENGTH(buffer),
7810 NULL, pusedDefaultChar);
7811 if (outsize > 0) {
7812 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7813 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007814 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007815 memcpy(out, buffer, outsize);
7816 out += outsize;
7817 continue;
7818 }
7819 }
7820 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7821 PyErr_SetFromWindowsErr(0);
7822 goto error;
7823 }
7824
Victor Stinner3a50e702011-10-18 21:21:00 +02007825 rep = unicode_encode_call_errorhandler(
7826 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007827 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007828 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007829 if (rep == NULL)
7830 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007831 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007832
7833 if (PyBytes_Check(rep)) {
7834 outsize = PyBytes_GET_SIZE(rep);
7835 if (outsize != 1) {
7836 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7837 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7838 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7839 Py_DECREF(rep);
7840 goto error;
7841 }
7842 out = PyBytes_AS_STRING(*outbytes) + offset;
7843 }
7844 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7845 out += outsize;
7846 }
7847 else {
7848 Py_ssize_t i;
7849 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007850 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007851
Benjamin Petersonbac79492012-01-14 13:34:47 -05007852 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007853 Py_DECREF(rep);
7854 goto error;
7855 }
7856
7857 outsize = PyUnicode_GET_LENGTH(rep);
7858 if (outsize != 1) {
7859 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7860 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7861 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7862 Py_DECREF(rep);
7863 goto error;
7864 }
7865 out = PyBytes_AS_STRING(*outbytes) + offset;
7866 }
7867 kind = PyUnicode_KIND(rep);
7868 data = PyUnicode_DATA(rep);
7869 for (i=0; i < outsize; i++) {
7870 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7871 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007872 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007873 encoding, unicode,
7874 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007875 "unable to encode error handler result to ASCII");
7876 Py_DECREF(rep);
7877 goto error;
7878 }
7879 *out = (unsigned char)ch;
7880 out++;
7881 }
7882 }
7883 Py_DECREF(rep);
7884 }
7885 /* write a NUL byte */
7886 *out = 0;
7887 outsize = out - PyBytes_AS_STRING(*outbytes);
7888 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7889 if (_PyBytes_Resize(outbytes, outsize) < 0)
7890 goto error;
7891 ret = 0;
7892
7893error:
7894 Py_XDECREF(encoding_obj);
7895 Py_XDECREF(errorHandler);
7896 Py_XDECREF(exc);
7897 return ret;
7898}
7899
Victor Stinner3a50e702011-10-18 21:21:00 +02007900static PyObject *
7901encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007902 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007903 const char *errors)
7904{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007905 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007906 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007907 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007908 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007909
Victor Stinner29dacf22015-01-26 16:41:32 +01007910 if (!PyUnicode_Check(unicode)) {
7911 PyErr_BadArgument();
7912 return NULL;
7913 }
7914
Benjamin Petersonbac79492012-01-14 13:34:47 -05007915 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007916 return NULL;
7917 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007918
Victor Stinner3a50e702011-10-18 21:21:00 +02007919 if (code_page < 0) {
7920 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7921 return NULL;
7922 }
7923
Martin v. Löwis3d325192011-11-04 18:23:06 +01007924 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007925 return PyBytes_FromStringAndSize(NULL, 0);
7926
Victor Stinner7581cef2011-11-03 22:32:33 +01007927 offset = 0;
7928 do
7929 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007930#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007931 if (len > DECODING_CHUNK_SIZE) {
7932 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007933 done = 0;
7934 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007935 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007936#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007937 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007938 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007939 done = 1;
7940 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007941
Victor Stinner76a31a62011-11-04 00:05:13 +01007942 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007943 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007944 errors);
7945 if (ret == -2)
7946 ret = encode_code_page_errors(code_page, &outbytes,
7947 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007948 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007949 if (ret < 0) {
7950 Py_XDECREF(outbytes);
7951 return NULL;
7952 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007953
Victor Stinner7581cef2011-11-03 22:32:33 +01007954 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007955 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007956 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007957
Victor Stinner3a50e702011-10-18 21:21:00 +02007958 return outbytes;
7959}
7960
7961PyObject *
7962PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7963 Py_ssize_t size,
7964 const char *errors)
7965{
Victor Stinner7581cef2011-11-03 22:32:33 +01007966 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007967 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007968 if (unicode == NULL)
7969 return NULL;
7970 res = encode_code_page(CP_ACP, unicode, errors);
7971 Py_DECREF(unicode);
7972 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007973}
7974
7975PyObject *
7976PyUnicode_EncodeCodePage(int code_page,
7977 PyObject *unicode,
7978 const char *errors)
7979{
Victor Stinner7581cef2011-11-03 22:32:33 +01007980 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007981}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007982
Alexander Belopolsky40018472011-02-26 01:02:56 +00007983PyObject *
7984PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007985{
Victor Stinner7581cef2011-11-03 22:32:33 +01007986 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007987}
7988
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007989#undef NEED_RETRY
7990
Steve Dowercc16be82016-09-08 10:35:16 -07007991#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007992
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993/* --- Character Mapping Codec -------------------------------------------- */
7994
Victor Stinnerfb161b12013-04-18 01:44:27 +02007995static int
7996charmap_decode_string(const char *s,
7997 Py_ssize_t size,
7998 PyObject *mapping,
7999 const char *errors,
8000 _PyUnicodeWriter *writer)
8001{
8002 const char *starts = s;
8003 const char *e;
8004 Py_ssize_t startinpos, endinpos;
8005 PyObject *errorHandler = NULL, *exc = NULL;
8006 Py_ssize_t maplen;
8007 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008008 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008009 Py_UCS4 x;
8010 unsigned char ch;
8011
8012 if (PyUnicode_READY(mapping) == -1)
8013 return -1;
8014
8015 maplen = PyUnicode_GET_LENGTH(mapping);
8016 mapdata = PyUnicode_DATA(mapping);
8017 mapkind = PyUnicode_KIND(mapping);
8018
8019 e = s + size;
8020
8021 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8022 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8023 * is disabled in encoding aliases, latin1 is preferred because
8024 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008025 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008026 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8027 Py_UCS4 maxchar = writer->maxchar;
8028
8029 assert (writer->kind == PyUnicode_1BYTE_KIND);
8030 while (s < e) {
8031 ch = *s;
8032 x = mapdata_ucs1[ch];
8033 if (x > maxchar) {
8034 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8035 goto onError;
8036 maxchar = writer->maxchar;
8037 outdata = (Py_UCS1 *)writer->data;
8038 }
8039 outdata[writer->pos] = x;
8040 writer->pos++;
8041 ++s;
8042 }
8043 return 0;
8044 }
8045
8046 while (s < e) {
8047 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8048 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008049 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008050 if (outkind == PyUnicode_1BYTE_KIND) {
8051 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8052 Py_UCS4 maxchar = writer->maxchar;
8053 while (s < e) {
8054 ch = *s;
8055 x = mapdata_ucs2[ch];
8056 if (x > maxchar)
8057 goto Error;
8058 outdata[writer->pos] = x;
8059 writer->pos++;
8060 ++s;
8061 }
8062 break;
8063 }
8064 else if (outkind == PyUnicode_2BYTE_KIND) {
8065 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8066 while (s < e) {
8067 ch = *s;
8068 x = mapdata_ucs2[ch];
8069 if (x == 0xFFFE)
8070 goto Error;
8071 outdata[writer->pos] = x;
8072 writer->pos++;
8073 ++s;
8074 }
8075 break;
8076 }
8077 }
8078 ch = *s;
8079
8080 if (ch < maplen)
8081 x = PyUnicode_READ(mapkind, mapdata, ch);
8082 else
8083 x = 0xfffe; /* invalid value */
8084Error:
8085 if (x == 0xfffe)
8086 {
8087 /* undefined mapping */
8088 startinpos = s-starts;
8089 endinpos = startinpos+1;
8090 if (unicode_decode_call_errorhandler_writer(
8091 errors, &errorHandler,
8092 "charmap", "character maps to <undefined>",
8093 &starts, &e, &startinpos, &endinpos, &exc, &s,
8094 writer)) {
8095 goto onError;
8096 }
8097 continue;
8098 }
8099
8100 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8101 goto onError;
8102 ++s;
8103 }
8104 Py_XDECREF(errorHandler);
8105 Py_XDECREF(exc);
8106 return 0;
8107
8108onError:
8109 Py_XDECREF(errorHandler);
8110 Py_XDECREF(exc);
8111 return -1;
8112}
8113
8114static int
8115charmap_decode_mapping(const char *s,
8116 Py_ssize_t size,
8117 PyObject *mapping,
8118 const char *errors,
8119 _PyUnicodeWriter *writer)
8120{
8121 const char *starts = s;
8122 const char *e;
8123 Py_ssize_t startinpos, endinpos;
8124 PyObject *errorHandler = NULL, *exc = NULL;
8125 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008126 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008127
8128 e = s + size;
8129
8130 while (s < e) {
8131 ch = *s;
8132
8133 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8134 key = PyLong_FromLong((long)ch);
8135 if (key == NULL)
8136 goto onError;
8137
8138 item = PyObject_GetItem(mapping, key);
8139 Py_DECREF(key);
8140 if (item == NULL) {
8141 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8142 /* No mapping found means: mapping is undefined. */
8143 PyErr_Clear();
8144 goto Undefined;
8145 } else
8146 goto onError;
8147 }
8148
8149 /* Apply mapping */
8150 if (item == Py_None)
8151 goto Undefined;
8152 if (PyLong_Check(item)) {
8153 long value = PyLong_AS_LONG(item);
8154 if (value == 0xFFFE)
8155 goto Undefined;
8156 if (value < 0 || value > MAX_UNICODE) {
8157 PyErr_Format(PyExc_TypeError,
8158 "character mapping must be in range(0x%lx)",
8159 (unsigned long)MAX_UNICODE + 1);
8160 goto onError;
8161 }
8162
8163 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8164 goto onError;
8165 }
8166 else if (PyUnicode_Check(item)) {
8167 if (PyUnicode_READY(item) == -1)
8168 goto onError;
8169 if (PyUnicode_GET_LENGTH(item) == 1) {
8170 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8171 if (value == 0xFFFE)
8172 goto Undefined;
8173 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8174 goto onError;
8175 }
8176 else {
8177 writer->overallocate = 1;
8178 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8179 goto onError;
8180 }
8181 }
8182 else {
8183 /* wrong return value */
8184 PyErr_SetString(PyExc_TypeError,
8185 "character mapping must return integer, None or str");
8186 goto onError;
8187 }
8188 Py_CLEAR(item);
8189 ++s;
8190 continue;
8191
8192Undefined:
8193 /* undefined mapping */
8194 Py_CLEAR(item);
8195 startinpos = s-starts;
8196 endinpos = startinpos+1;
8197 if (unicode_decode_call_errorhandler_writer(
8198 errors, &errorHandler,
8199 "charmap", "character maps to <undefined>",
8200 &starts, &e, &startinpos, &endinpos, &exc, &s,
8201 writer)) {
8202 goto onError;
8203 }
8204 }
8205 Py_XDECREF(errorHandler);
8206 Py_XDECREF(exc);
8207 return 0;
8208
8209onError:
8210 Py_XDECREF(item);
8211 Py_XDECREF(errorHandler);
8212 Py_XDECREF(exc);
8213 return -1;
8214}
8215
Alexander Belopolsky40018472011-02-26 01:02:56 +00008216PyObject *
8217PyUnicode_DecodeCharmap(const char *s,
8218 Py_ssize_t size,
8219 PyObject *mapping,
8220 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008222 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008223
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 /* Default to Latin-1 */
8225 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008229 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008230 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008231 writer.min_length = size;
8232 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008234
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008235 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008236 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8237 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008238 }
8239 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008240 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8241 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008243 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008244
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008246 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 return NULL;
8248}
8249
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008250/* Charmap encoding: the lookup table */
8251
Alexander Belopolsky40018472011-02-26 01:02:56 +00008252struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 PyObject_HEAD
8254 unsigned char level1[32];
8255 int count2, count3;
8256 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008257};
8258
8259static PyObject*
8260encoding_map_size(PyObject *obj, PyObject* args)
8261{
8262 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008263 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008265}
8266
8267static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008268 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 PyDoc_STR("Return the size (in bytes) of this object") },
8270 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008271};
8272
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008273static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008274 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 "EncodingMap", /*tp_name*/
8276 sizeof(struct encoding_map), /*tp_basicsize*/
8277 0, /*tp_itemsize*/
8278 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008279 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008280 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 0, /*tp_getattr*/
8282 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008283 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 0, /*tp_repr*/
8285 0, /*tp_as_number*/
8286 0, /*tp_as_sequence*/
8287 0, /*tp_as_mapping*/
8288 0, /*tp_hash*/
8289 0, /*tp_call*/
8290 0, /*tp_str*/
8291 0, /*tp_getattro*/
8292 0, /*tp_setattro*/
8293 0, /*tp_as_buffer*/
8294 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8295 0, /*tp_doc*/
8296 0, /*tp_traverse*/
8297 0, /*tp_clear*/
8298 0, /*tp_richcompare*/
8299 0, /*tp_weaklistoffset*/
8300 0, /*tp_iter*/
8301 0, /*tp_iternext*/
8302 encoding_map_methods, /*tp_methods*/
8303 0, /*tp_members*/
8304 0, /*tp_getset*/
8305 0, /*tp_base*/
8306 0, /*tp_dict*/
8307 0, /*tp_descr_get*/
8308 0, /*tp_descr_set*/
8309 0, /*tp_dictoffset*/
8310 0, /*tp_init*/
8311 0, /*tp_alloc*/
8312 0, /*tp_new*/
8313 0, /*tp_free*/
8314 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008315};
8316
8317PyObject*
8318PyUnicode_BuildEncodingMap(PyObject* string)
8319{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008320 PyObject *result;
8321 struct encoding_map *mresult;
8322 int i;
8323 int need_dict = 0;
8324 unsigned char level1[32];
8325 unsigned char level2[512];
8326 unsigned char *mlevel1, *mlevel2, *mlevel3;
8327 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008328 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008329 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008330 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008331 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008333 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008334 PyErr_BadArgument();
8335 return NULL;
8336 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008337 kind = PyUnicode_KIND(string);
8338 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008339 length = PyUnicode_GET_LENGTH(string);
8340 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008341 memset(level1, 0xFF, sizeof level1);
8342 memset(level2, 0xFF, sizeof level2);
8343
8344 /* If there isn't a one-to-one mapping of NULL to \0,
8345 or if there are non-BMP characters, we need to use
8346 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008348 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008349 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008350 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 ch = PyUnicode_READ(kind, data, i);
8352 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008353 need_dict = 1;
8354 break;
8355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008357 /* unmapped character */
8358 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008359 l1 = ch >> 11;
8360 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008361 if (level1[l1] == 0xFF)
8362 level1[l1] = count2++;
8363 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008364 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008365 }
8366
8367 if (count2 >= 0xFF || count3 >= 0xFF)
8368 need_dict = 1;
8369
8370 if (need_dict) {
8371 PyObject *result = PyDict_New();
8372 PyObject *key, *value;
8373 if (!result)
8374 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008375 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008377 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008378 if (!key || !value)
8379 goto failed1;
8380 if (PyDict_SetItem(result, key, value) == -1)
8381 goto failed1;
8382 Py_DECREF(key);
8383 Py_DECREF(value);
8384 }
8385 return result;
8386 failed1:
8387 Py_XDECREF(key);
8388 Py_XDECREF(value);
8389 Py_DECREF(result);
8390 return NULL;
8391 }
8392
8393 /* Create a three-level trie */
8394 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8395 16*count2 + 128*count3 - 1);
8396 if (!result)
8397 return PyErr_NoMemory();
8398 PyObject_Init(result, &EncodingMapType);
8399 mresult = (struct encoding_map*)result;
8400 mresult->count2 = count2;
8401 mresult->count3 = count3;
8402 mlevel1 = mresult->level1;
8403 mlevel2 = mresult->level23;
8404 mlevel3 = mresult->level23 + 16*count2;
8405 memcpy(mlevel1, level1, 32);
8406 memset(mlevel2, 0xFF, 16*count2);
8407 memset(mlevel3, 0, 128*count3);
8408 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008409 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008410 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008411 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8412 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008413 /* unmapped character */
8414 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008415 o1 = ch>>11;
8416 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008417 i2 = 16*mlevel1[o1] + o2;
8418 if (mlevel2[i2] == 0xFF)
8419 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008420 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008421 i3 = 128*mlevel2[i2] + o3;
8422 mlevel3[i3] = i;
8423 }
8424 return result;
8425}
8426
8427static int
Victor Stinner22168992011-11-20 17:09:18 +01008428encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008429{
8430 struct encoding_map *map = (struct encoding_map*)mapping;
8431 int l1 = c>>11;
8432 int l2 = (c>>7) & 0xF;
8433 int l3 = c & 0x7F;
8434 int i;
8435
Victor Stinner22168992011-11-20 17:09:18 +01008436 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008438 if (c == 0)
8439 return 0;
8440 /* level 1*/
8441 i = map->level1[l1];
8442 if (i == 0xFF) {
8443 return -1;
8444 }
8445 /* level 2*/
8446 i = map->level23[16*i+l2];
8447 if (i == 0xFF) {
8448 return -1;
8449 }
8450 /* level 3 */
8451 i = map->level23[16*map->count2 + 128*i + l3];
8452 if (i == 0) {
8453 return -1;
8454 }
8455 return i;
8456}
8457
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008458/* Lookup the character ch in the mapping. If the character
8459 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008460 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008461static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008462charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463{
Christian Heimes217cfd12007-12-02 14:31:20 +00008464 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008465 PyObject *x;
8466
8467 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469 x = PyObject_GetItem(mapping, w);
8470 Py_DECREF(w);
8471 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8473 /* No mapping found means: mapping is undefined. */
8474 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008475 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 } else
8477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008479 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008481 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 long value = PyLong_AS_LONG(x);
8483 if (value < 0 || value > 255) {
8484 PyErr_SetString(PyExc_TypeError,
8485 "character mapping must be in range(256)");
8486 Py_DECREF(x);
8487 return NULL;
8488 }
8489 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008491 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 /* wrong return value */
8495 PyErr_Format(PyExc_TypeError,
8496 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008497 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 Py_DECREF(x);
8499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 }
8501}
8502
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008503static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008504charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008505{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008506 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8507 /* exponentially overallocate to minimize reallocations */
8508 if (requiredsize < 2*outsize)
8509 requiredsize = 2*outsize;
8510 if (_PyBytes_Resize(outobj, requiredsize))
8511 return -1;
8512 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008513}
8514
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008518/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008519 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 space is available. Return a new reference to the object that
8521 was put in the output buffer, or Py_None, if the mapping was undefined
8522 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008523 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008524static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008525charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008526 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008527{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008528 PyObject *rep;
8529 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008530 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531
Andy Lesterdffe4c02020-03-04 07:15:20 -06008532 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008533 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008535 if (res == -1)
8536 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 if (outsize<requiredsize)
8538 if (charmapencode_resize(outobj, outpos, requiredsize))
8539 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008540 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 outstart[(*outpos)++] = (char)res;
8542 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008543 }
8544
8545 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008548 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 Py_DECREF(rep);
8550 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008551 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 if (PyLong_Check(rep)) {
8553 Py_ssize_t requiredsize = *outpos+1;
8554 if (outsize<requiredsize)
8555 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8556 Py_DECREF(rep);
8557 return enc_EXCEPTION;
8558 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008559 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008561 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 else {
8563 const char *repchars = PyBytes_AS_STRING(rep);
8564 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8565 Py_ssize_t requiredsize = *outpos+repsize;
8566 if (outsize<requiredsize)
8567 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8568 Py_DECREF(rep);
8569 return enc_EXCEPTION;
8570 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008571 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 memcpy(outstart + *outpos, repchars, repsize);
8573 *outpos += repsize;
8574 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008576 Py_DECREF(rep);
8577 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578}
8579
8580/* handle an error in PyUnicode_EncodeCharmap
8581 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008582static int
8583charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008584 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008585 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008586 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008587 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588{
8589 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008590 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008591 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008592 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008593 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008594 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008596 Py_ssize_t collstartpos = *inpos;
8597 Py_ssize_t collendpos = *inpos+1;
8598 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008599 const char *encoding = "charmap";
8600 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008601 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008602 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008603 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008604
Benjamin Petersonbac79492012-01-14 13:34:47 -05008605 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008606 return -1;
8607 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008608 /* find all unencodable characters */
8609 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008610 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008611 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008612 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008613 val = encoding_map_lookup(ch, mapping);
8614 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 break;
8616 ++collendpos;
8617 continue;
8618 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008619
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008620 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8621 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 if (rep==NULL)
8623 return -1;
8624 else if (rep!=Py_None) {
8625 Py_DECREF(rep);
8626 break;
8627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008628 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630 }
8631 /* cache callback name lookup
8632 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008633 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008634 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008635
8636 switch (*error_handler) {
8637 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008638 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008639 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008640
8641 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008642 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 x = charmapencode_output('?', mapping, res, respos);
8644 if (x==enc_EXCEPTION) {
8645 return -1;
8646 }
8647 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008648 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 return -1;
8650 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008651 }
8652 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008653 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008654 *inpos = collendpos;
8655 break;
Victor Stinner50149202015-09-22 00:26:54 +02008656
8657 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008658 /* generate replacement (temporarily (mis)uses p) */
8659 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 char buffer[2+29+1+1];
8661 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008662 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 for (cp = buffer; *cp; ++cp) {
8664 x = charmapencode_output(*cp, mapping, res, respos);
8665 if (x==enc_EXCEPTION)
8666 return -1;
8667 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008668 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 return -1;
8670 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008671 }
8672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008673 *inpos = collendpos;
8674 break;
Victor Stinner50149202015-09-22 00:26:54 +02008675
Benjamin Peterson14339b62009-01-31 16:36:08 +00008676 default:
Victor Stinner50149202015-09-22 00:26:54 +02008677 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008678 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008680 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008682 if (PyBytes_Check(repunicode)) {
8683 /* Directly copy bytes result to output. */
8684 Py_ssize_t outsize = PyBytes_Size(*res);
8685 Py_ssize_t requiredsize;
8686 repsize = PyBytes_Size(repunicode);
8687 requiredsize = *respos + repsize;
8688 if (requiredsize > outsize)
8689 /* Make room for all additional bytes. */
8690 if (charmapencode_resize(res, respos, requiredsize)) {
8691 Py_DECREF(repunicode);
8692 return -1;
8693 }
8694 memcpy(PyBytes_AsString(*res) + *respos,
8695 PyBytes_AsString(repunicode), repsize);
8696 *respos += repsize;
8697 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008698 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008699 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008701 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008702 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008703 Py_DECREF(repunicode);
8704 return -1;
8705 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008706 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008707 data = PyUnicode_DATA(repunicode);
8708 kind = PyUnicode_KIND(repunicode);
8709 for (index = 0; index < repsize; index++) {
8710 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8711 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008713 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 return -1;
8715 }
8716 else if (x==enc_FAILED) {
8717 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008718 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 return -1;
8720 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008721 }
8722 *inpos = newpos;
8723 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724 }
8725 return 0;
8726}
8727
Alexander Belopolsky40018472011-02-26 01:02:56 +00008728PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008729_PyUnicode_EncodeCharmap(PyObject *unicode,
8730 PyObject *mapping,
8731 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733 /* output object */
8734 PyObject *res = NULL;
8735 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008736 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008737 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008739 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008740 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008742 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008743 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008744 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745
Benjamin Petersonbac79492012-01-14 13:34:47 -05008746 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008747 return NULL;
8748 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008749 data = PyUnicode_DATA(unicode);
8750 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008751
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 /* Default to Latin-1 */
8753 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008754 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008756 /* allocate enough for a simple encoding without
8757 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008758 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008759 if (res == NULL)
8760 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008761 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008764 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008765 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008767 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 if (x==enc_EXCEPTION) /* error */
8769 goto onError;
8770 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008771 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008773 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 &res, &respos)) {
8775 goto onError;
8776 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008777 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 else
8779 /* done with this character => adjust input position */
8780 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008784 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008785 if (_PyBytes_Resize(&res, respos) < 0)
8786 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008788 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008789 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008790 return res;
8791
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008793 Py_XDECREF(res);
8794 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008795 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 return NULL;
8797}
8798
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008799/* Deprecated */
8800PyObject *
8801PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8802 Py_ssize_t size,
8803 PyObject *mapping,
8804 const char *errors)
8805{
8806 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008807 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008808 if (unicode == NULL)
8809 return NULL;
8810 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8811 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008812 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008813}
8814
Alexander Belopolsky40018472011-02-26 01:02:56 +00008815PyObject *
8816PyUnicode_AsCharmapString(PyObject *unicode,
8817 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818{
8819 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008820 PyErr_BadArgument();
8821 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008823 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824}
8825
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008826/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008827static void
8828make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008829 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008830 Py_ssize_t startpos, Py_ssize_t endpos,
8831 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008833 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 *exceptionObject = _PyUnicodeTranslateError_Create(
8835 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836 }
8837 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008838 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8839 goto onError;
8840 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8841 goto onError;
8842 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8843 goto onError;
8844 return;
8845 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008846 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 }
8848}
8849
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008850/* error handling callback helper:
8851 build arguments, call the callback and check the arguments,
8852 put the result into newpos and return the replacement string, which
8853 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008854static PyObject *
8855unicode_translate_call_errorhandler(const char *errors,
8856 PyObject **errorHandler,
8857 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008859 Py_ssize_t startpos, Py_ssize_t endpos,
8860 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008861{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008862 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008863
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008864 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008865 PyObject *restuple;
8866 PyObject *resunicode;
8867
8868 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008870 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008871 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008872 }
8873
8874 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008876 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008877 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008878
Petr Viktorinffd97532020-02-11 17:46:57 +01008879 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008880 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008882 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008883 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 Py_DECREF(restuple);
8885 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008886 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008887 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 &resunicode, &i_newpos)) {
8889 Py_DECREF(restuple);
8890 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008891 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008892 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008894 else
8895 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008897 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 Py_DECREF(restuple);
8899 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008900 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008901 Py_INCREF(resunicode);
8902 Py_DECREF(restuple);
8903 return resunicode;
8904}
8905
8906/* Lookup the character ch in the mapping and put the result in result,
8907 which must be decrefed by the caller.
8908 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008909static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008911{
Christian Heimes217cfd12007-12-02 14:31:20 +00008912 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008913 PyObject *x;
8914
8915 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008917 x = PyObject_GetItem(mapping, w);
8918 Py_DECREF(w);
8919 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8921 /* No mapping found means: use 1:1 mapping. */
8922 PyErr_Clear();
8923 *result = NULL;
8924 return 0;
8925 } else
8926 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008927 }
8928 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 *result = x;
8930 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008931 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008932 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008934 if (value < 0 || value > MAX_UNICODE) {
8935 PyErr_Format(PyExc_ValueError,
8936 "character mapping must be in range(0x%x)",
8937 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 Py_DECREF(x);
8939 return -1;
8940 }
8941 *result = x;
8942 return 0;
8943 }
8944 else if (PyUnicode_Check(x)) {
8945 *result = x;
8946 return 0;
8947 }
8948 else {
8949 /* wrong return value */
8950 PyErr_SetString(PyExc_TypeError,
8951 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008952 Py_DECREF(x);
8953 return -1;
8954 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008955}
Victor Stinner1194ea02014-04-04 19:37:40 +02008956
8957/* lookup the character, write the result into the writer.
8958 Return 1 if the result was written into the writer, return 0 if the mapping
8959 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008960static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008961charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8962 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008963{
Victor Stinner1194ea02014-04-04 19:37:40 +02008964 PyObject *item;
8965
8966 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008967 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008968
8969 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008970 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008971 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008974 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008975 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008976
8977 if (item == Py_None) {
8978 Py_DECREF(item);
8979 return 0;
8980 }
8981
8982 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008983 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8984 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8985 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008986 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8987 Py_DECREF(item);
8988 return -1;
8989 }
8990 Py_DECREF(item);
8991 return 1;
8992 }
8993
8994 if (!PyUnicode_Check(item)) {
8995 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008996 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008997 }
8998
8999 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9000 Py_DECREF(item);
9001 return -1;
9002 }
9003
9004 Py_DECREF(item);
9005 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009006}
9007
Victor Stinner89a76ab2014-04-05 11:44:04 +02009008static int
9009unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9010 Py_UCS1 *translate)
9011{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009012 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009013 int ret = 0;
9014
Victor Stinner89a76ab2014-04-05 11:44:04 +02009015 if (charmaptranslate_lookup(ch, mapping, &item)) {
9016 return -1;
9017 }
9018
9019 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009020 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009021 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009022 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009023 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009024 /* not found => default to 1:1 mapping */
9025 translate[ch] = ch;
9026 return 1;
9027 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009028 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009029 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009030 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9031 used it */
9032 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009033 /* invalid character or character outside ASCII:
9034 skip the fast translate */
9035 goto exit;
9036 }
9037 translate[ch] = (Py_UCS1)replace;
9038 }
9039 else if (PyUnicode_Check(item)) {
9040 Py_UCS4 replace;
9041
9042 if (PyUnicode_READY(item) == -1) {
9043 Py_DECREF(item);
9044 return -1;
9045 }
9046 if (PyUnicode_GET_LENGTH(item) != 1)
9047 goto exit;
9048
9049 replace = PyUnicode_READ_CHAR(item, 0);
9050 if (replace > 127)
9051 goto exit;
9052 translate[ch] = (Py_UCS1)replace;
9053 }
9054 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009055 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009056 goto exit;
9057 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009058 ret = 1;
9059
Benjamin Peterson1365de72014-04-07 20:15:41 -04009060 exit:
9061 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009062 return ret;
9063}
9064
9065/* Fast path for ascii => ascii translation. Return 1 if the whole string
9066 was translated into writer, return 0 if the input string was partially
9067 translated into writer, raise an exception and return -1 on error. */
9068static int
9069unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009070 _PyUnicodeWriter *writer, int ignore,
9071 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009072{
Victor Stinner872b2912014-04-05 14:27:07 +02009073 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009074 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009075 const Py_UCS1 *in, *end;
9076 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009077 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009078
Victor Stinner89a76ab2014-04-05 11:44:04 +02009079 len = PyUnicode_GET_LENGTH(input);
9080
Victor Stinner872b2912014-04-05 14:27:07 +02009081 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009082
9083 in = PyUnicode_1BYTE_DATA(input);
9084 end = in + len;
9085
9086 assert(PyUnicode_IS_ASCII(writer->buffer));
9087 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9088 out = PyUnicode_1BYTE_DATA(writer->buffer);
9089
Victor Stinner872b2912014-04-05 14:27:07 +02009090 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009091 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009092 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009093 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009094 int translate = unicode_fast_translate_lookup(mapping, ch,
9095 ascii_table);
9096 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009097 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009098 if (translate == 0)
9099 goto exit;
9100 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009101 }
Victor Stinner872b2912014-04-05 14:27:07 +02009102 if (ch2 == 0xfe) {
9103 if (ignore)
9104 continue;
9105 goto exit;
9106 }
9107 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009108 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009109 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009110 }
Victor Stinner872b2912014-04-05 14:27:07 +02009111 res = 1;
9112
9113exit:
9114 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009115 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009116 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009117}
9118
Victor Stinner3222da22015-10-01 22:07:32 +02009119static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120_PyUnicode_TranslateCharmap(PyObject *input,
9121 PyObject *mapping,
9122 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009125 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 Py_ssize_t size, i;
9127 int kind;
9128 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009129 _PyUnicodeWriter writer;
9130 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009131 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009132 PyObject *errorHandler = NULL;
9133 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009134 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009135 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009136
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 PyErr_BadArgument();
9139 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 if (PyUnicode_READY(input) == -1)
9143 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009144 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145 kind = PyUnicode_KIND(input);
9146 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009148 if (size == 0)
9149 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009151 /* allocate enough for a simple 1:1 translation without
9152 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009153 _PyUnicodeWriter_Init(&writer);
9154 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156
Victor Stinner872b2912014-04-05 14:27:07 +02009157 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9158
Victor Stinner33798672016-03-01 21:59:58 +01009159 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009160 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009161 if (PyUnicode_IS_ASCII(input)) {
9162 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9163 if (res < 0) {
9164 _PyUnicodeWriter_Dealloc(&writer);
9165 return NULL;
9166 }
9167 if (res == 1)
9168 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009169 }
Victor Stinner33798672016-03-01 21:59:58 +01009170 else {
9171 i = 0;
9172 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009175 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009176 int translate;
9177 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9178 Py_ssize_t newpos;
9179 /* startpos for collecting untranslatable chars */
9180 Py_ssize_t collstart;
9181 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009182 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183
Victor Stinner1194ea02014-04-04 19:37:40 +02009184 ch = PyUnicode_READ(kind, data, i);
9185 translate = charmaptranslate_output(ch, mapping, &writer);
9186 if (translate < 0)
9187 goto onError;
9188
9189 if (translate != 0) {
9190 /* it worked => adjust input pointer */
9191 ++i;
9192 continue;
9193 }
9194
9195 /* untranslatable character */
9196 collstart = i;
9197 collend = i+1;
9198
9199 /* find all untranslatable characters */
9200 while (collend < size) {
9201 PyObject *x;
9202 ch = PyUnicode_READ(kind, data, collend);
9203 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009204 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009205 Py_XDECREF(x);
9206 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009208 ++collend;
9209 }
9210
9211 if (ignore) {
9212 i = collend;
9213 }
9214 else {
9215 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9216 reason, input, &exc,
9217 collstart, collend, &newpos);
9218 if (repunicode == NULL)
9219 goto onError;
9220 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009221 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009222 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009223 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009224 Py_DECREF(repunicode);
9225 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009226 }
9227 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009228 Py_XDECREF(exc);
9229 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009230 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231
Benjamin Peterson29060642009-01-31 22:14:21 +00009232 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009233 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009234 Py_XDECREF(exc);
9235 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236 return NULL;
9237}
9238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239/* Deprecated. Use PyUnicode_Translate instead. */
9240PyObject *
9241PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9242 Py_ssize_t size,
9243 PyObject *mapping,
9244 const char *errors)
9245{
Christian Heimes5f520f42012-09-11 14:03:25 +02009246 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009247 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 if (!unicode)
9249 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009250 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9251 Py_DECREF(unicode);
9252 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253}
9254
Alexander Belopolsky40018472011-02-26 01:02:56 +00009255PyObject *
9256PyUnicode_Translate(PyObject *str,
9257 PyObject *mapping,
9258 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009259{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009260 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009261 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009262 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009263}
Tim Petersced69f82003-09-16 20:30:58 +00009264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265PyObject *
9266_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9267{
9268 if (!PyUnicode_Check(unicode)) {
9269 PyErr_BadInternalCall();
9270 return NULL;
9271 }
9272 if (PyUnicode_READY(unicode) == -1)
9273 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009274 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 /* If the string is already ASCII, just return the same string */
9276 Py_INCREF(unicode);
9277 return unicode;
9278 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009279
9280 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9281 PyObject *result = PyUnicode_New(len, 127);
9282 if (result == NULL) {
9283 return NULL;
9284 }
9285
9286 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9287 int kind = PyUnicode_KIND(unicode);
9288 const void *data = PyUnicode_DATA(unicode);
9289 Py_ssize_t i;
9290 for (i = 0; i < len; ++i) {
9291 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9292 if (ch < 127) {
9293 out[i] = ch;
9294 }
9295 else if (Py_UNICODE_ISSPACE(ch)) {
9296 out[i] = ' ';
9297 }
9298 else {
9299 int decimal = Py_UNICODE_TODECIMAL(ch);
9300 if (decimal < 0) {
9301 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009302 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009303 _PyUnicode_LENGTH(result) = i + 1;
9304 break;
9305 }
9306 out[i] = '0' + decimal;
9307 }
9308 }
9309
INADA Naoki16dfca42018-07-14 12:06:43 +09009310 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009311 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312}
9313
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009314PyObject *
9315PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9316 Py_ssize_t length)
9317{
Victor Stinnerf0124502011-11-21 23:12:56 +01009318 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009319 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009320 Py_UCS4 maxchar;
9321 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009322 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009323
Victor Stinner99d7ad02012-02-22 13:37:39 +01009324 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009325 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009326 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009327 if (ch > 127) {
9328 int decimal = Py_UNICODE_TODECIMAL(ch);
9329 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009330 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009331 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009332 }
9333 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009334
9335 /* Copy to a new string */
9336 decimal = PyUnicode_New(length, maxchar);
9337 if (decimal == NULL)
9338 return decimal;
9339 kind = PyUnicode_KIND(decimal);
9340 data = PyUnicode_DATA(decimal);
9341 /* Iterate over code points */
9342 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009343 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009344 if (ch > 127) {
9345 int decimal = Py_UNICODE_TODECIMAL(ch);
9346 if (decimal >= 0)
9347 ch = '0' + decimal;
9348 }
9349 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009351 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009352}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009353/* --- Decimal Encoder ---------------------------------------------------- */
9354
Alexander Belopolsky40018472011-02-26 01:02:56 +00009355int
9356PyUnicode_EncodeDecimal(Py_UNICODE *s,
9357 Py_ssize_t length,
9358 char *output,
9359 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009360{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009361 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009362 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009363 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009364 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009365
9366 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009367 PyErr_BadArgument();
9368 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009369 }
9370
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009371 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009372 if (unicode == NULL)
9373 return -1;
9374
Victor Stinner42bf7752011-11-21 22:52:58 +01009375 kind = PyUnicode_KIND(unicode);
9376 data = PyUnicode_DATA(unicode);
9377
Victor Stinnerb84d7232011-11-22 01:50:07 +01009378 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009379 PyObject *exc;
9380 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009381 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009382 Py_ssize_t startpos;
9383
9384 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009385
Benjamin Peterson29060642009-01-31 22:14:21 +00009386 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009387 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009388 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009390 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 decimal = Py_UNICODE_TODECIMAL(ch);
9392 if (decimal >= 0) {
9393 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009394 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009395 continue;
9396 }
9397 if (0 < ch && ch < 256) {
9398 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009399 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009400 continue;
9401 }
Victor Stinner6345be92011-11-25 20:09:01 +01009402
Victor Stinner42bf7752011-11-21 22:52:58 +01009403 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009404 exc = NULL;
9405 raise_encode_exception(&exc, "decimal", unicode,
9406 startpos, startpos+1,
9407 "invalid decimal Unicode string");
9408 Py_XDECREF(exc);
9409 Py_DECREF(unicode);
9410 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009411 }
9412 /* 0-terminate the output string */
9413 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009414 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009415 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009416}
9417
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418/* --- Helpers ------------------------------------------------------------ */
9419
/* helper macro to fixup start/end slice values

   Clamps end to len and resolves negative start/end relative to len
   (flooring them at 0), the way slice indices are normalized. start and end
   must be modifiable lvalues; every argument may be evaluated more than
   once. The body is a single do { } while (0) statement so the macro is
   safe inside an unbraced if/else; the invocation must end with ';'. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009436any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009438 Py_ssize_t end,
9439 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009441 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009442 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 Py_ssize_t len1, len2, result;
9444
9445 kind1 = PyUnicode_KIND(s1);
9446 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009447 if (kind1 < kind2)
9448 return -1;
9449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 len1 = PyUnicode_GET_LENGTH(s1);
9451 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009452 ADJUST_INDICES(start, end, len1);
9453 if (end - start < len2)
9454 return -1;
9455
9456 buf1 = PyUnicode_DATA(s1);
9457 buf2 = PyUnicode_DATA(s2);
9458 if (len2 == 1) {
9459 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9460 result = findchar((const char *)buf1 + kind1*start,
9461 kind1, end - start, ch, direction);
9462 if (result == -1)
9463 return -1;
9464 else
9465 return start + result;
9466 }
9467
9468 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009469 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009470 if (!buf2)
9471 return -2;
9472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473
Victor Stinner794d5672011-10-10 03:21:36 +02009474 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009475 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009476 case PyUnicode_1BYTE_KIND:
9477 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9478 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9479 else
9480 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9481 break;
9482 case PyUnicode_2BYTE_KIND:
9483 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9484 break;
9485 case PyUnicode_4BYTE_KIND:
9486 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9487 break;
9488 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009489 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009490 }
9491 }
9492 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009493 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009494 case PyUnicode_1BYTE_KIND:
9495 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9496 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9497 else
9498 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9499 break;
9500 case PyUnicode_2BYTE_KIND:
9501 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9502 break;
9503 case PyUnicode_4BYTE_KIND:
9504 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9505 break;
9506 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009507 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 }
9510
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009511 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009512 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009513 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514
9515 return result;
9516}
9517
Victor Stinner59423e32018-11-26 13:40:01 +01009518/* _PyUnicode_InsertThousandsGrouping() helper functions */
9519#include "stringlib/localeutil.h"
9520
9521/**
9522 * InsertThousandsGrouping:
9523 * @writer: Unicode writer.
9524 * @n_buffer: Number of characters in @buffer.
9525 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9526 * @d_pos: Start of digits string.
9527 * @n_digits: The number of digits in the string, in which we want
9528 * to put the grouping chars.
9529 * @min_width: The minimum width of the digits in the output string.
9530 * Output will be zero-padded on the left to fill.
9531 * @grouping: see definition in localeconv().
9532 * @thousands_sep: see definition in localeconv().
9533 *
9534 * There are 2 modes: counting and filling. If @writer is NULL,
9535 * we are in counting mode, else filling mode.
9536 * If counting, the required buffer size is returned.
9537 * If filling, we know the buffer will be large enough, so we don't
9538 * need to pass in the buffer size.
9539 * Inserts thousand grouping characters (as defined by grouping and
9540 * thousands_sep) into @writer.
9541 *
9542 * Return value: -1 on error, number of characters otherwise.
9543 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009545_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009546 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009547 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009548 PyObject *digits,
9549 Py_ssize_t d_pos,
9550 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009551 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009552 const char *grouping,
9553 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009554 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555{
Xtreak3f7983a2019-01-07 20:39:14 +05309556 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009557 if (writer) {
9558 assert(digits != NULL);
9559 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009560 }
9561 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009562 assert(digits == NULL);
9563 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009564 }
Victor Stinner59423e32018-11-26 13:40:01 +01009565 assert(0 <= d_pos);
9566 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009567 assert(grouping != NULL);
9568
9569 if (digits != NULL) {
9570 if (PyUnicode_READY(digits) == -1) {
9571 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009572 }
Victor Stinner59423e32018-11-26 13:40:01 +01009573 }
9574 if (PyUnicode_READY(thousands_sep) == -1) {
9575 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009576 }
9577
Victor Stinner59423e32018-11-26 13:40:01 +01009578 Py_ssize_t count = 0;
9579 Py_ssize_t n_zeros;
9580 int loop_broken = 0;
9581 int use_separator = 0; /* First time through, don't append the
9582 separator. They only go between
9583 groups. */
9584 Py_ssize_t buffer_pos;
9585 Py_ssize_t digits_pos;
9586 Py_ssize_t len;
9587 Py_ssize_t n_chars;
9588 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9589 be looked at */
9590 /* A generator that returns all of the grouping widths, until it
9591 returns 0. */
9592 GroupGenerator groupgen;
9593 GroupGenerator_init(&groupgen, grouping);
9594 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9595
9596 /* if digits are not grouped, thousands separator
9597 should be an empty string */
9598 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9599
9600 digits_pos = d_pos + n_digits;
9601 if (writer) {
9602 buffer_pos = writer->pos + n_buffer;
9603 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9604 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 }
Victor Stinner59423e32018-11-26 13:40:01 +01009606 else {
9607 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009608 }
Victor Stinner59423e32018-11-26 13:40:01 +01009609
9610 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009611 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009612 }
Victor Stinner59423e32018-11-26 13:40:01 +01009613
9614 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9615 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9616 n_zeros = Py_MAX(0, len - remaining);
9617 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9618
9619 /* Use n_zero zero's and n_chars chars */
9620
9621 /* Count only, don't do anything. */
9622 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9623
9624 /* Copy into the writer. */
9625 InsertThousandsGrouping_fill(writer, &buffer_pos,
9626 digits, &digits_pos,
9627 n_chars, n_zeros,
9628 use_separator ? thousands_sep : NULL,
9629 thousands_sep_len, maxchar);
9630
9631 /* Use a separator next time. */
9632 use_separator = 1;
9633
9634 remaining -= n_chars;
9635 min_width -= len;
9636
9637 if (remaining <= 0 && min_width <= 0) {
9638 loop_broken = 1;
9639 break;
9640 }
9641 min_width -= thousands_sep_len;
9642 }
9643 if (!loop_broken) {
9644 /* We left the loop without using a break statement. */
9645
9646 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9647 n_zeros = Py_MAX(0, len - remaining);
9648 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9649
9650 /* Use n_zero zero's and n_chars chars */
9651 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9652
9653 /* Copy into the writer. */
9654 InsertThousandsGrouping_fill(writer, &buffer_pos,
9655 digits, &digits_pos,
9656 n_chars, n_zeros,
9657 use_separator ? thousands_sep : NULL,
9658 thousands_sep_len, maxchar);
9659 }
9660 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661}
9662
9663
Alexander Belopolsky40018472011-02-26 01:02:56 +00009664Py_ssize_t
9665PyUnicode_Count(PyObject *str,
9666 PyObject *substr,
9667 Py_ssize_t start,
9668 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009670 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009671 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009672 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009674
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009675 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009676 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009677
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009678 kind1 = PyUnicode_KIND(str);
9679 kind2 = PyUnicode_KIND(substr);
9680 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009681 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009682
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009683 len1 = PyUnicode_GET_LENGTH(str);
9684 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009686 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009687 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009688
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009689 buf1 = PyUnicode_DATA(str);
9690 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009691 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009692 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009693 if (!buf2)
9694 goto onError;
9695 }
9696
9697 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009699 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009700 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009701 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009702 buf2, len2, PY_SSIZE_T_MAX
9703 );
9704 else
9705 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009706 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009707 buf2, len2, PY_SSIZE_T_MAX
9708 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 break;
9710 case PyUnicode_2BYTE_KIND:
9711 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009712 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 buf2, len2, PY_SSIZE_T_MAX
9714 );
9715 break;
9716 case PyUnicode_4BYTE_KIND:
9717 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009718 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009719 buf2, len2, PY_SSIZE_T_MAX
9720 );
9721 break;
9722 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009723 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009725
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009726 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009727 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009728 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009732 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9733 if (kind2 != kind1)
9734 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736}
9737
Alexander Belopolsky40018472011-02-26 01:02:56 +00009738Py_ssize_t
9739PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009740 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009741 Py_ssize_t start,
9742 Py_ssize_t end,
9743 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009744{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009745 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009746 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009747
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009748 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749}
9750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751Py_ssize_t
9752PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9753 Py_ssize_t start, Py_ssize_t end,
9754 int direction)
9755{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009757 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 if (PyUnicode_READY(str) == -1)
9759 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009760 len = PyUnicode_GET_LENGTH(str);
9761 ADJUST_INDICES(start, end, len);
9762 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009763 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009765 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9766 kind, end-start, ch, direction);
9767 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009769 else
9770 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009771}
9772
Alexander Belopolsky40018472011-02-26 01:02:56 +00009773static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009774tailmatch(PyObject *self,
9775 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009776 Py_ssize_t start,
9777 Py_ssize_t end,
9778 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 int kind_self;
9781 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009782 const void *data_self;
9783 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 Py_ssize_t offset;
9785 Py_ssize_t i;
9786 Py_ssize_t end_sub;
9787
9788 if (PyUnicode_READY(self) == -1 ||
9789 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009790 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9793 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009795 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009796
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009797 if (PyUnicode_GET_LENGTH(substring) == 0)
9798 return 1;
9799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009800 kind_self = PyUnicode_KIND(self);
9801 data_self = PyUnicode_DATA(self);
9802 kind_sub = PyUnicode_KIND(substring);
9803 data_sub = PyUnicode_DATA(substring);
9804 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9805
9806 if (direction > 0)
9807 offset = end;
9808 else
9809 offset = start;
9810
9811 if (PyUnicode_READ(kind_self, data_self, offset) ==
9812 PyUnicode_READ(kind_sub, data_sub, 0) &&
9813 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9814 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9815 /* If both are of the same kind, memcmp is sufficient */
9816 if (kind_self == kind_sub) {
9817 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009818 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 data_sub,
9820 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009821 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009823 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 else {
9825 /* We do not need to compare 0 and len(substring)-1 because
9826 the if statement above ensured already that they are equal
9827 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 for (i = 1; i < end_sub; ++i) {
9829 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9830 PyUnicode_READ(kind_sub, data_sub, i))
9831 return 0;
9832 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009833 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835 }
9836
9837 return 0;
9838}
9839
Alexander Belopolsky40018472011-02-26 01:02:56 +00009840Py_ssize_t
9841PyUnicode_Tailmatch(PyObject *str,
9842 PyObject *substr,
9843 Py_ssize_t start,
9844 Py_ssize_t end,
9845 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009847 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009848 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009849
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009850 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851}
9852
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009853static PyObject *
9854ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009856 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009857 const char *data = PyUnicode_DATA(self);
9858 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009859 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009860
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009861 res = PyUnicode_New(len, 127);
9862 if (res == NULL)
9863 return NULL;
9864 resdata = PyUnicode_DATA(res);
9865 if (lower)
9866 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009868 _Py_bytes_upper(resdata, data, len);
9869 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009870}
9871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009873handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009875 Py_ssize_t j;
9876 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009877 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009878 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009879
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009880 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9881
9882 where ! is a negation and \p{xxx} is a character with property xxx.
9883 */
9884 for (j = i - 1; j >= 0; j--) {
9885 c = PyUnicode_READ(kind, data, j);
9886 if (!_PyUnicode_IsCaseIgnorable(c))
9887 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009889 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9890 if (final_sigma) {
9891 for (j = i + 1; j < length; j++) {
9892 c = PyUnicode_READ(kind, data, j);
9893 if (!_PyUnicode_IsCaseIgnorable(c))
9894 break;
9895 }
9896 final_sigma = j == length || !_PyUnicode_IsCased(c);
9897 }
9898 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899}
9900
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009901static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009902lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009903 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009905 /* Obscure special case. */
9906 if (c == 0x3A3) {
9907 mapped[0] = handle_capital_sigma(kind, data, length, i);
9908 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009910 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911}
9912
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009913static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009914do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009915{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009916 Py_ssize_t i, k = 0;
9917 int n_res, j;
9918 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009919
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009920 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009921 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009922 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009923 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009924 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009926 for (i = 1; i < length; i++) {
9927 c = PyUnicode_READ(kind, data, i);
9928 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9929 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009930 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009931 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009932 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009933 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009934 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935}
9936
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009937static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009938do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009939 Py_ssize_t i, k = 0;
9940
9941 for (i = 0; i < length; i++) {
9942 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9943 int n_res, j;
9944 if (Py_UNICODE_ISUPPER(c)) {
9945 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9946 }
9947 else if (Py_UNICODE_ISLOWER(c)) {
9948 n_res = _PyUnicode_ToUpperFull(c, mapped);
9949 }
9950 else {
9951 n_res = 1;
9952 mapped[0] = c;
9953 }
9954 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009955 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009956 res[k++] = mapped[j];
9957 }
9958 }
9959 return k;
9960}
9961
9962static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009963do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009964 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009966 Py_ssize_t i, k = 0;
9967
9968 for (i = 0; i < length; i++) {
9969 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9970 int n_res, j;
9971 if (lower)
9972 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9973 else
9974 n_res = _PyUnicode_ToUpperFull(c, mapped);
9975 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009976 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009977 res[k++] = mapped[j];
9978 }
9979 }
9980 return k;
9981}
9982
9983static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009984do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009985{
9986 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9987}
9988
9989static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009990do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009991{
9992 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9993}
9994
Benjamin Petersone51757f2012-01-12 21:10:29 -05009995static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009996do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -05009997{
9998 Py_ssize_t i, k = 0;
9999
10000 for (i = 0; i < length; i++) {
10001 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10002 Py_UCS4 mapped[3];
10003 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10004 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010005 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010006 res[k++] = mapped[j];
10007 }
10008 }
10009 return k;
10010}
10011
10012static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010013do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010014{
10015 Py_ssize_t i, k = 0;
10016 int previous_is_cased;
10017
10018 previous_is_cased = 0;
10019 for (i = 0; i < length; i++) {
10020 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10021 Py_UCS4 mapped[3];
10022 int n_res, j;
10023
10024 if (previous_is_cased)
10025 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10026 else
10027 n_res = _PyUnicode_ToTitleFull(c, mapped);
10028
10029 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010030 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010031 res[k++] = mapped[j];
10032 }
10033
10034 previous_is_cased = _PyUnicode_IsCased(c);
10035 }
10036 return k;
10037}
10038
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010039static PyObject *
10040case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010041 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010042{
10043 PyObject *res = NULL;
10044 Py_ssize_t length, newlength = 0;
10045 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010046 const void *data;
10047 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010048 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10049
Benjamin Petersoneea48462012-01-16 14:28:50 -050010050 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010051
10052 kind = PyUnicode_KIND(self);
10053 data = PyUnicode_DATA(self);
10054 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010055 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010056 PyErr_SetString(PyExc_OverflowError, "string is too long");
10057 return NULL;
10058 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010059 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010060 if (tmp == NULL)
10061 return PyErr_NoMemory();
10062 newlength = perform(kind, data, length, tmp, &maxchar);
10063 res = PyUnicode_New(newlength, maxchar);
10064 if (res == NULL)
10065 goto leave;
10066 tmpend = tmp + newlength;
10067 outdata = PyUnicode_DATA(res);
10068 outkind = PyUnicode_KIND(res);
10069 switch (outkind) {
10070 case PyUnicode_1BYTE_KIND:
10071 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10072 break;
10073 case PyUnicode_2BYTE_KIND:
10074 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10075 break;
10076 case PyUnicode_4BYTE_KIND:
10077 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10078 break;
10079 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010080 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010081 }
10082 leave:
10083 PyMem_FREE(tmp);
10084 return res;
10085}
10086
Tim Peters8ce9f162004-08-27 01:49:32 +000010087PyObject *
10088PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010089{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010090 PyObject *res;
10091 PyObject *fseq;
10092 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010093 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010094
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010095 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010096 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010097 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010098 }
10099
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010100 /* NOTE: the following code can't call back into Python code,
10101 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010102 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010103
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010104 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010105 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010106 res = _PyUnicode_JoinArray(separator, items, seqlen);
10107 Py_DECREF(fseq);
10108 return res;
10109}
10110
10111PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010112_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010113{
10114 PyObject *res = NULL; /* the result */
10115 PyObject *sep = NULL;
10116 Py_ssize_t seplen;
10117 PyObject *item;
10118 Py_ssize_t sz, i, res_offset;
10119 Py_UCS4 maxchar;
10120 Py_UCS4 item_maxchar;
10121 int use_memcpy;
10122 unsigned char *res_data = NULL, *sep_data = NULL;
10123 PyObject *last_obj;
10124 unsigned int kind = 0;
10125
Tim Peters05eba1f2004-08-27 21:32:02 +000010126 /* If empty sequence, return u"". */
10127 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010128 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010129 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010130
Tim Peters05eba1f2004-08-27 21:32:02 +000010131 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010132 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010133 if (seqlen == 1) {
10134 if (PyUnicode_CheckExact(items[0])) {
10135 res = items[0];
10136 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010137 return res;
10138 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010139 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010140 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010141 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010142 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010143 /* Set up sep and seplen */
10144 if (separator == NULL) {
10145 /* fall back to a blank space separator */
10146 sep = PyUnicode_FromOrdinal(' ');
10147 if (!sep)
10148 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010149 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010150 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010151 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010152 else {
10153 if (!PyUnicode_Check(separator)) {
10154 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010155 "separator: expected str instance,"
10156 " %.80s found",
10157 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010158 goto onError;
10159 }
10160 if (PyUnicode_READY(separator))
10161 goto onError;
10162 sep = separator;
10163 seplen = PyUnicode_GET_LENGTH(separator);
10164 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10165 /* inc refcount to keep this code path symmetric with the
10166 above case of a blank separator */
10167 Py_INCREF(sep);
10168 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010169 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010170 }
10171
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010172 /* There are at least two things to join, or else we have a subclass
10173 * of str in the sequence.
10174 * Do a pre-pass to figure out the total amount of space we'll
10175 * need (sz), and see whether all argument are strings.
10176 */
10177 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010178#ifdef Py_DEBUG
10179 use_memcpy = 0;
10180#else
10181 use_memcpy = 1;
10182#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010183 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010184 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010185 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010186 if (!PyUnicode_Check(item)) {
10187 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010188 "sequence item %zd: expected str instance,"
10189 " %.80s found",
10190 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010191 goto onError;
10192 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 if (PyUnicode_READY(item) == -1)
10194 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010195 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010197 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010198 if (i != 0) {
10199 add_sz += seplen;
10200 }
10201 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010202 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010203 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010204 goto onError;
10205 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010206 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010207 if (use_memcpy && last_obj != NULL) {
10208 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10209 use_memcpy = 0;
10210 }
10211 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010212 }
Tim Petersced69f82003-09-16 20:30:58 +000010213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010215 if (res == NULL)
10216 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010217
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010218 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010219#ifdef Py_DEBUG
10220 use_memcpy = 0;
10221#else
10222 if (use_memcpy) {
10223 res_data = PyUnicode_1BYTE_DATA(res);
10224 kind = PyUnicode_KIND(res);
10225 if (seplen != 0)
10226 sep_data = PyUnicode_1BYTE_DATA(sep);
10227 }
10228#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010229 if (use_memcpy) {
10230 for (i = 0; i < seqlen; ++i) {
10231 Py_ssize_t itemlen;
10232 item = items[i];
10233
10234 /* Copy item, and maybe the separator. */
10235 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010236 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010237 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010238 kind * seplen);
10239 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010240 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010241
10242 itemlen = PyUnicode_GET_LENGTH(item);
10243 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010244 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010245 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010246 kind * itemlen);
10247 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010248 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010249 }
10250 assert(res_data == PyUnicode_1BYTE_DATA(res)
10251 + kind * PyUnicode_GET_LENGTH(res));
10252 }
10253 else {
10254 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10255 Py_ssize_t itemlen;
10256 item = items[i];
10257
10258 /* Copy item, and maybe the separator. */
10259 if (i && seplen != 0) {
10260 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10261 res_offset += seplen;
10262 }
10263
10264 itemlen = PyUnicode_GET_LENGTH(item);
10265 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010266 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010267 res_offset += itemlen;
10268 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010269 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010270 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010271 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010274 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276
Benjamin Peterson29060642009-01-31 22:14:21 +000010277 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010279 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280 return NULL;
10281}
10282
Victor Stinnerd3f08822012-05-29 12:57:52 +020010283void
10284_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10285 Py_UCS4 fill_char)
10286{
10287 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010288 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010289 assert(PyUnicode_IS_READY(unicode));
10290 assert(unicode_modifiable(unicode));
10291 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10292 assert(start >= 0);
10293 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010294 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010295}
10296
Victor Stinner3fe55312012-01-04 00:33:50 +010010297Py_ssize_t
10298PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10299 Py_UCS4 fill_char)
10300{
10301 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010302
10303 if (!PyUnicode_Check(unicode)) {
10304 PyErr_BadInternalCall();
10305 return -1;
10306 }
10307 if (PyUnicode_READY(unicode) == -1)
10308 return -1;
10309 if (unicode_check_modifiable(unicode))
10310 return -1;
10311
Victor Stinnerd3f08822012-05-29 12:57:52 +020010312 if (start < 0) {
10313 PyErr_SetString(PyExc_IndexError, "string index out of range");
10314 return -1;
10315 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010316 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10317 PyErr_SetString(PyExc_ValueError,
10318 "fill character is bigger than "
10319 "the string maximum character");
10320 return -1;
10321 }
10322
10323 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10324 length = Py_MIN(maxlen, length);
10325 if (length <= 0)
10326 return 0;
10327
Victor Stinnerd3f08822012-05-29 12:57:52 +020010328 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010329 return length;
10330}
10331
Victor Stinner9310abb2011-10-05 00:59:23 +020010332static PyObject *
10333pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010334 Py_ssize_t left,
10335 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010337{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 PyObject *u;
10339 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010340 int kind;
10341 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010342
10343 if (left < 0)
10344 left = 0;
10345 if (right < 0)
10346 right = 0;
10347
Victor Stinnerc4b49542011-12-11 22:44:26 +010010348 if (left == 0 && right == 0)
10349 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10352 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010353 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10354 return NULL;
10355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010357 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010359 if (!u)
10360 return NULL;
10361
10362 kind = PyUnicode_KIND(u);
10363 data = PyUnicode_DATA(u);
10364 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010365 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010366 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010367 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010368 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010369 assert(_PyUnicode_CheckConsistency(u, 1));
10370 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371}
10372
Alexander Belopolsky40018472011-02-26 01:02:56 +000010373PyObject *
10374PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010378 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380
Benjamin Petersonead6b532011-12-20 17:23:42 -060010381 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010383 if (PyUnicode_IS_ASCII(string))
10384 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010385 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010386 PyUnicode_GET_LENGTH(string), keepends);
10387 else
10388 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010389 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010390 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 break;
10392 case PyUnicode_2BYTE_KIND:
10393 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010394 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 PyUnicode_GET_LENGTH(string), keepends);
10396 break;
10397 case PyUnicode_4BYTE_KIND:
10398 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010399 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 PyUnicode_GET_LENGTH(string), keepends);
10401 break;
10402 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010403 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406}
10407
Alexander Belopolsky40018472011-02-26 01:02:56 +000010408static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010409split(PyObject *self,
10410 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010411 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010413 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010414 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 Py_ssize_t len1, len2;
10416 PyObject* out;
10417
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010419 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 if (PyUnicode_READY(self) == -1)
10422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010425 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010427 if (PyUnicode_IS_ASCII(self))
10428 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010429 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010430 PyUnicode_GET_LENGTH(self), maxcount
10431 );
10432 else
10433 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010434 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010435 PyUnicode_GET_LENGTH(self), maxcount
10436 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 case PyUnicode_2BYTE_KIND:
10438 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010439 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 PyUnicode_GET_LENGTH(self), maxcount
10441 );
10442 case PyUnicode_4BYTE_KIND:
10443 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010444 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 PyUnicode_GET_LENGTH(self), maxcount
10446 );
10447 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010448 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 }
10450
10451 if (PyUnicode_READY(substring) == -1)
10452 return NULL;
10453
10454 kind1 = PyUnicode_KIND(self);
10455 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 len1 = PyUnicode_GET_LENGTH(self);
10457 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010458 if (kind1 < kind2 || len1 < len2) {
10459 out = PyList_New(1);
10460 if (out == NULL)
10461 return NULL;
10462 Py_INCREF(self);
10463 PyList_SET_ITEM(out, 0, self);
10464 return out;
10465 }
10466 buf1 = PyUnicode_DATA(self);
10467 buf2 = PyUnicode_DATA(substring);
10468 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010469 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010470 if (!buf2)
10471 return NULL;
10472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010474 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010476 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10477 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010478 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010479 else
10480 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010481 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 break;
10483 case PyUnicode_2BYTE_KIND:
10484 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010485 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 break;
10487 case PyUnicode_4BYTE_KIND:
10488 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010489 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 break;
10491 default:
10492 out = NULL;
10493 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010494 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010495 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010496 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498}
10499
Alexander Belopolsky40018472011-02-26 01:02:56 +000010500static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010501rsplit(PyObject *self,
10502 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010503 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010504{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010505 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010506 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 Py_ssize_t len1, len2;
10508 PyObject* out;
10509
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010510 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010511 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 if (PyUnicode_READY(self) == -1)
10514 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010517 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010519 if (PyUnicode_IS_ASCII(self))
10520 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010521 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010522 PyUnicode_GET_LENGTH(self), maxcount
10523 );
10524 else
10525 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010526 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010527 PyUnicode_GET_LENGTH(self), maxcount
10528 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 case PyUnicode_2BYTE_KIND:
10530 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010531 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 PyUnicode_GET_LENGTH(self), maxcount
10533 );
10534 case PyUnicode_4BYTE_KIND:
10535 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010536 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 PyUnicode_GET_LENGTH(self), maxcount
10538 );
10539 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010540 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 }
10542
10543 if (PyUnicode_READY(substring) == -1)
10544 return NULL;
10545
10546 kind1 = PyUnicode_KIND(self);
10547 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 len1 = PyUnicode_GET_LENGTH(self);
10549 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010550 if (kind1 < kind2 || len1 < len2) {
10551 out = PyList_New(1);
10552 if (out == NULL)
10553 return NULL;
10554 Py_INCREF(self);
10555 PyList_SET_ITEM(out, 0, self);
10556 return out;
10557 }
10558 buf1 = PyUnicode_DATA(self);
10559 buf2 = PyUnicode_DATA(substring);
10560 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010561 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010562 if (!buf2)
10563 return NULL;
10564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010566 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010568 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10569 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010570 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010571 else
10572 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010573 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 break;
10575 case PyUnicode_2BYTE_KIND:
10576 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010577 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 break;
10579 case PyUnicode_4BYTE_KIND:
10580 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010581 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 break;
10583 default:
10584 out = NULL;
10585 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010586 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010587 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010588 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 return out;
10590}
10591
10592static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010593anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10594 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010596 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010598 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10599 return asciilib_find(buf1, len1, buf2, len2, offset);
10600 else
10601 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 case PyUnicode_2BYTE_KIND:
10603 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10604 case PyUnicode_4BYTE_KIND:
10605 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10606 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010607 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608}
10609
10610static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010611anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10612 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010614 switch (kind) {
10615 case PyUnicode_1BYTE_KIND:
10616 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10617 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10618 else
10619 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10620 case PyUnicode_2BYTE_KIND:
10621 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10622 case PyUnicode_4BYTE_KIND:
10623 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10624 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010625 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010626}
10627
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010628static void
10629replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10630 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10631{
10632 int kind = PyUnicode_KIND(u);
10633 void *data = PyUnicode_DATA(u);
10634 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10635 if (kind == PyUnicode_1BYTE_KIND) {
10636 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10637 (Py_UCS1 *)data + len,
10638 u1, u2, maxcount);
10639 }
10640 else if (kind == PyUnicode_2BYTE_KIND) {
10641 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10642 (Py_UCS2 *)data + len,
10643 u1, u2, maxcount);
10644 }
10645 else {
10646 assert(kind == PyUnicode_4BYTE_KIND);
10647 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10648 (Py_UCS4 *)data + len,
10649 u1, u2, maxcount);
10650 }
10651}
10652
/* Return a string in which occurrences of str1 in self are replaced by
   str2, performing at most maxcount replacements.

   A negative maxcount is mapped to PY_SSIZE_T_MAX; a maxcount of 0 takes
   the "nothing" exit.  When there is nothing to replace the function
   returns unicode_result_unchanged(self).  On failure it returns NULL.

   The data pointer, kind and length of all three strings are read
   unconditionally at the top of the function.
   NOTE(review): this presumably requires the caller to pass ready str
   objects -- confirm against the callers.

   Three strategies are used:
   - len1 == len2 == 1: copy self into a new string and let
     replace_1char_inplace() rewrite it;
   - len1 == len2 > 1: copy self into a new string and overwrite every
     match with str2;
   - len1 != len2: count the matches first, allocate a result of the final
     size and assemble it piece by piece (for len1 == 0, str2 is
     interleaved with the characters of self).

   sbuf, buf1 and buf2 may be replaced by temporary buffers obtained from
   unicode_askind(); srelease, release1 and release2 record which of them
   have to be freed, and every exit label frees them. */
Alexander Belopolsky40018472011-02-26 01:02:56 +000010653static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654replace(PyObject *self, PyObject *str1,
10655 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010656{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010658 const char *sbuf = PyUnicode_DATA(self);
10659 const void *buf1 = PyUnicode_DATA(str1);
10660 const void *buf2 = PyUnicode_DATA(str2);
/* Each flag becomes 1 once the matching pointer above refers to a
   temporary buffer that must be freed on exit. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 int srelease = 0, release1 = 0, release2 = 0;
10662 int skind = PyUnicode_KIND(self);
10663 int kind1 = PyUnicode_KIND(str1);
10664 int kind2 = PyUnicode_KIND(str2);
10665 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10666 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10667 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010668 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010669 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670
/* str1 is longer than self, so it cannot occur in it. */
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010671 if (slen < len1)
10672 goto nothing;
10673
/* A negative maxcount means "no limit". */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010675 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010676 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010677 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678
/* Pattern and replacement are the very same object: replacing would not
   change the contents. */
Victor Stinner59de0ee2011-10-07 10:01:28 +020010679 if (str1 == str2)
10680 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681
Victor Stinner49a0a212011-10-12 23:46:10 +020010682 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010683 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10684 if (maxchar < maxchar_str1)
10685 /* substring too wide to be present */
10686 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010687 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10688 /* Replacing str1 with str2 may cause a maxchar reduction in the
10689 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010690 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
/* The result must be able to hold the characters of both self and str2. */
Benjamin Peterson7e303732013-06-10 09:19:46 -070010691 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010694 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010696 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010698 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010699 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010700 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010701
Victor Stinner69ed0f42013-04-09 21:48:24 +020010702 u1 = PyUnicode_READ(kind1, buf1, 0);
/* Look for u1 first; nothing is allocated when it does not occur. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010703 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010704 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010705 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010706 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010708 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010710
/* Copy self, then rewrite the copy starting at the position found. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010711 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10712 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010713 }
10714 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 int rkind = skind;
10716 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010717 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 if (kind1 < rkind) {
10720 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010721 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 if (!buf1) goto error;
10723 release1 = 1;
10724 }
/* Locate the first match before allocating the result. */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010725 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010726 if (i < 0)
10727 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 if (rkind > kind2) {
10729 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010730 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 if (!buf2) goto error;
10732 release2 = 1;
10733 }
10734 else if (rkind < kind2) {
10735 /* widen self and buf1 */
10736 rkind = kind2;
/* Drop a buf1 widened earlier so that it can be rebuilt from the
   original str1 data at the new rkind. */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010737 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010738 assert(buf1 != PyUnicode_DATA(str1));
10739 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010740 buf1 = PyUnicode_DATA(str1);
10741 release1 = 0;
10742 }
10743 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 if (!sbuf) goto error;
10745 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010746 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 if (!buf1) goto error;
10748 release1 = 1;
10749 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010750 u = PyUnicode_New(slen, maxchar);
10751 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010753 assert(PyUnicode_KIND(u) == rkind);
10754 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010755
/* rkind is used as the per-character byte width in the memcpy sizes and
   offsets below. */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010756 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010757 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010758 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010760 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010762
/* The first match has already been replaced, hence the pre-decrement. */
10763 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010764 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010765 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010766 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010767 if (i == -1)
10768 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010769 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010771 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010773 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010775 }
10776 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010778 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 int rkind = skind;
10780 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010783 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010784 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 if (!buf1) goto error;
10786 release1 = 1;
10787 }
/* n is the number of replacements to perform (the count call is given
   maxcount). */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010788 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010789 if (n == 0)
10790 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010792 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010793 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 if (!buf2) goto error;
10795 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010798 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010800 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 if (!sbuf) goto error;
10802 srelease = 1;
/* Drop a buf1 widened earlier so that it can be rebuilt from the
   original str1 data at the new rkind. */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010803 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010804 assert(buf1 != PyUnicode_DATA(str1));
10805 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010806 buf1 = PyUnicode_DATA(str1);
10807 release1 = 0;
10808 }
10809 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 if (!buf1) goto error;
10811 release1 = 1;
10812 }
10813 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10814 PyUnicode_GET_LENGTH(str1))); */
/* Reject a growing result whose length would not fit in Py_ssize_t;
   n is nonzero here because n == 0 left through "nothing" above. */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010815 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 PyErr_SetString(PyExc_OverflowError,
10817 "replace string is too long");
10818 goto error;
10819 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010820 new_size = slen + n * (len2 - len1);
/* Everything was replaced by nothing: hand back the shared empty string. */
Victor Stinner49a0a212011-10-12 23:46:10 +020010821 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010822 _Py_INCREF_UNICODE_EMPTY();
10823 if (!unicode_empty)
10824 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010825 u = unicode_empty;
10826 goto done;
10827 }
/* NOTE(review): presumably guards the rkind * <count> byte computations
   below against overflow -- confirm. */
Xiang Zhangb0541f42017-01-10 10:52:00 +080010828 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 PyErr_SetString(PyExc_OverflowError,
10830 "replace string is too long");
10831 goto error;
10832 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010833 u = PyUnicode_New(new_size, maxchar);
10834 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010836 assert(PyUnicode_KIND(u) == rkind);
10837 res = PyUnicode_DATA(u);
/* i is the read position in sbuf, ires the write position in res, both
   counted in characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 ires = i = 0;
10839 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010840 while (n-- > 0) {
10841 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010842 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010843 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010844 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010845 if (j == -1)
10846 break;
10847 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010848 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010849 memcpy(res + rkind * ires,
10850 sbuf + rkind * i,
10851 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010852 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010853 }
10854 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010856 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010858 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010864 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010865 memcpy(res + rkind * ires,
10866 sbuf + rkind * i,
10867 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010868 }
10869 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010870 /* interleave */
/* Empty pattern: write str2 n times, with one character of self between
   consecutive copies, then append the rest of self. */
10871 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010872 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010874 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010876 if (--n <= 0)
10877 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010878 memcpy(res + rkind * ires,
10879 sbuf + rkind * i,
10880 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 ires++;
10882 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010883 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010884 memcpy(res + rkind * ires,
10885 sbuf + rkind * i,
10886 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010887 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010888 }
10889
10890 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010891 unicode_adjust_maxchar(&u);
10892 if (u == NULL)
10893 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010895
/* Success exit; also reached directly for the empty result. */
10896 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010897 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10898 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10899 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010901 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010903 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010905 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010906 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010908
Benjamin Peterson29060642009-01-31 22:14:21 +000010909 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010910 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010911 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10912 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10913 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010915 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010917 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010919 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010920 return unicode_result_unchanged(self);
10921
/* Failure exit: free any temporary buffers and return NULL. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010923 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10924 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10925 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10926 if (srelease)
10927 PyMem_FREE((void *)sbuf);
10928 if (release1)
10929 PyMem_FREE((void *)buf1);
10930 if (release2)
10931 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933}
10934
10935/* --- Unicode Object Methods --------------------------------------------- */
10936
INADA Naoki3ae20562017-01-16 20:41:20 +090010937/*[clinic input]
10938str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939
INADA Naoki3ae20562017-01-16 20:41:20 +090010940Return a version of the string where each word is titlecased.
10941
10942More specifically, words start with uppercased characters and all remaining
10943cased characters have lower case.
10944[clinic start generated code]*/
10945
10946static PyObject *
10947unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010948/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010950 if (PyUnicode_READY(self) == -1)
10951 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010952 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953}
10954
INADA Naoki3ae20562017-01-16 20:41:20 +090010955/*[clinic input]
10956str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957
INADA Naoki3ae20562017-01-16 20:41:20 +090010958Return a capitalized version of the string.
10959
10960More specifically, make the first character have upper case and the rest lower
10961case.
10962[clinic start generated code]*/
10963
10964static PyObject *
10965unicode_capitalize_impl(PyObject *self)
10966/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010968 if (PyUnicode_READY(self) == -1)
10969 return NULL;
10970 if (PyUnicode_GET_LENGTH(self) == 0)
10971 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010972 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973}
10974
INADA Naoki3ae20562017-01-16 20:41:20 +090010975/*[clinic input]
10976str.casefold as unicode_casefold
10977
10978Return a version of the string suitable for caseless comparisons.
10979[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010980
10981static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010982unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010983/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010984{
10985 if (PyUnicode_READY(self) == -1)
10986 return NULL;
10987 if (PyUnicode_IS_ASCII(self))
10988 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010989 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010990}
10991
10992
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010993/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010994
10995static int
10996convert_uc(PyObject *obj, void *addr)
10997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010999
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011000 if (!PyUnicode_Check(obj)) {
11001 PyErr_Format(PyExc_TypeError,
11002 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011003 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011004 return 0;
11005 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011006 if (PyUnicode_READY(obj) < 0)
11007 return 0;
11008 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011009 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011010 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011011 return 0;
11012 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011013 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011014 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011015}
11016
INADA Naoki3ae20562017-01-16 20:41:20 +090011017/*[clinic input]
11018str.center as unicode_center
11019
11020 width: Py_ssize_t
11021 fillchar: Py_UCS4 = ' '
11022 /
11023
11024Return a centered string of length width.
11025
11026Padding is done using the specified fill character (default is a space).
11027[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028
11029static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011030unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11031/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011033 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034
Benjamin Petersonbac79492012-01-14 13:34:47 -050011035 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036 return NULL;
11037
Victor Stinnerc4b49542011-12-11 22:44:26 +010011038 if (PyUnicode_GET_LENGTH(self) >= width)
11039 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040
Victor Stinnerc4b49542011-12-11 22:44:26 +010011041 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042 left = marg / 2 + (marg & width & 1);
11043
Victor Stinner9310abb2011-10-05 00:59:23 +020011044 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045}
11046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047/* This function assumes that str1 and str2 are readied by the caller. */
11048
Marc-André Lemburge5034372000-08-08 08:04:29 +000011049static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011050unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011051{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011052#define COMPARE(TYPE1, TYPE2) \
11053 do { \
11054 TYPE1* p1 = (TYPE1 *)data1; \
11055 TYPE2* p2 = (TYPE2 *)data2; \
11056 TYPE1* end = p1 + len; \
11057 Py_UCS4 c1, c2; \
11058 for (; p1 != end; p1++, p2++) { \
11059 c1 = *p1; \
11060 c2 = *p2; \
11061 if (c1 != c2) \
11062 return (c1 < c2) ? -1 : 1; \
11063 } \
11064 } \
11065 while (0)
11066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011068 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011069 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 kind1 = PyUnicode_KIND(str1);
11072 kind2 = PyUnicode_KIND(str2);
11073 data1 = PyUnicode_DATA(str1);
11074 data2 = PyUnicode_DATA(str2);
11075 len1 = PyUnicode_GET_LENGTH(str1);
11076 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011077 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011078
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011079 switch(kind1) {
11080 case PyUnicode_1BYTE_KIND:
11081 {
11082 switch(kind2) {
11083 case PyUnicode_1BYTE_KIND:
11084 {
11085 int cmp = memcmp(data1, data2, len);
11086 /* normalize result of memcmp() into the range [-1; 1] */
11087 if (cmp < 0)
11088 return -1;
11089 if (cmp > 0)
11090 return 1;
11091 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011092 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011093 case PyUnicode_2BYTE_KIND:
11094 COMPARE(Py_UCS1, Py_UCS2);
11095 break;
11096 case PyUnicode_4BYTE_KIND:
11097 COMPARE(Py_UCS1, Py_UCS4);
11098 break;
11099 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011100 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011101 }
11102 break;
11103 }
11104 case PyUnicode_2BYTE_KIND:
11105 {
11106 switch(kind2) {
11107 case PyUnicode_1BYTE_KIND:
11108 COMPARE(Py_UCS2, Py_UCS1);
11109 break;
11110 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011111 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011112 COMPARE(Py_UCS2, Py_UCS2);
11113 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011114 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011115 case PyUnicode_4BYTE_KIND:
11116 COMPARE(Py_UCS2, Py_UCS4);
11117 break;
11118 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011119 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011120 }
11121 break;
11122 }
11123 case PyUnicode_4BYTE_KIND:
11124 {
11125 switch(kind2) {
11126 case PyUnicode_1BYTE_KIND:
11127 COMPARE(Py_UCS4, Py_UCS1);
11128 break;
11129 case PyUnicode_2BYTE_KIND:
11130 COMPARE(Py_UCS4, Py_UCS2);
11131 break;
11132 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011133 {
11134#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11135 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11136 /* normalize result of wmemcmp() into the range [-1; 1] */
11137 if (cmp < 0)
11138 return -1;
11139 if (cmp > 0)
11140 return 1;
11141#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011142 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011143#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011144 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011145 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011146 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011147 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011148 }
11149 break;
11150 }
11151 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011152 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011153 }
11154
Victor Stinner770e19e2012-10-04 22:59:45 +020011155 if (len1 == len2)
11156 return 0;
11157 if (len1 < len2)
11158 return -1;
11159 else
11160 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011161
11162#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011163}
11164
Benjamin Peterson621b4302016-09-09 13:54:34 -070011165static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011166unicode_compare_eq(PyObject *str1, PyObject *str2)
11167{
11168 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011169 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011170 Py_ssize_t len;
11171 int cmp;
11172
Victor Stinnere5567ad2012-10-23 02:48:49 +020011173 len = PyUnicode_GET_LENGTH(str1);
11174 if (PyUnicode_GET_LENGTH(str2) != len)
11175 return 0;
11176 kind = PyUnicode_KIND(str1);
11177 if (PyUnicode_KIND(str2) != kind)
11178 return 0;
11179 data1 = PyUnicode_DATA(str1);
11180 data2 = PyUnicode_DATA(str2);
11181
11182 cmp = memcmp(data1, data2, len * kind);
11183 return (cmp == 0);
11184}
11185
11186
Alexander Belopolsky40018472011-02-26 01:02:56 +000011187int
11188PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11191 if (PyUnicode_READY(left) == -1 ||
11192 PyUnicode_READY(right) == -1)
11193 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011194
11195 /* a string is equal to itself */
11196 if (left == right)
11197 return 0;
11198
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011199 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011201 PyErr_Format(PyExc_TypeError,
11202 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011203 Py_TYPE(left)->tp_name,
11204 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205 return -1;
11206}
11207
Martin v. Löwis5b222132007-06-10 09:51:05 +000011208int
11209PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11210{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 Py_ssize_t i;
11212 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011213 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011214 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215
Victor Stinner910337b2011-10-03 03:20:16 +020011216 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011217 if (!PyUnicode_IS_READY(uni)) {
11218 const wchar_t *ws = _PyUnicode_WSTR(uni);
11219 /* Compare Unicode string and source character set string */
11220 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11221 if (chr != ustr[i])
11222 return (chr < ustr[i]) ? -1 : 1;
11223 }
11224 /* This check keeps Python strings that end in '\0' from comparing equal
11225 to C strings identical up to that point. */
11226 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11227 return 1; /* uni is longer */
11228 if (ustr[i])
11229 return -1; /* str is longer */
11230 return 0;
11231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011233 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011234 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011235 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011236 size_t len, len2 = strlen(str);
11237 int cmp;
11238
11239 len = Py_MIN(len1, len2);
11240 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011241 if (cmp != 0) {
11242 if (cmp < 0)
11243 return -1;
11244 else
11245 return 1;
11246 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011247 if (len1 > len2)
11248 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011249 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011250 return -1; /* str is longer */
11251 return 0;
11252 }
11253 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011254 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011255 /* Compare Unicode string and source character set string */
11256 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011257 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011258 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11259 /* This check keeps Python strings that end in '\0' from comparing equal
11260 to C strings identical up to that point. */
11261 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11262 return 1; /* uni is longer */
11263 if (str[i])
11264 return -1; /* str is longer */
11265 return 0;
11266 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011267}
11268
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011269static int
11270non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11271{
11272 size_t i, len;
11273 const wchar_t *p;
11274 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11275 if (strlen(str) != len)
11276 return 0;
11277 p = _PyUnicode_WSTR(unicode);
11278 assert(p);
11279 for (i = 0; i < len; i++) {
11280 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011281 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011282 return 0;
11283 }
11284 return 1;
11285}
11286
11287int
11288_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11289{
11290 size_t len;
11291 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011292 assert(str);
11293#ifndef NDEBUG
11294 for (const char *p = str; *p; p++) {
11295 assert((unsigned char)*p < 128);
11296 }
11297#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011298 if (PyUnicode_READY(unicode) == -1) {
11299 /* Memory error or bad data */
11300 PyErr_Clear();
11301 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11302 }
11303 if (!PyUnicode_IS_ASCII(unicode))
11304 return 0;
11305 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11306 return strlen(str) == len &&
11307 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11308}
11309
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011310int
11311_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11312{
11313 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011314
11315 assert(_PyUnicode_CHECK(left));
11316 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011317#ifndef NDEBUG
11318 for (const char *p = right->string; *p; p++) {
11319 assert((unsigned char)*p < 128);
11320 }
11321#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011322
11323 if (PyUnicode_READY(left) == -1) {
11324 /* memory error or bad data */
11325 PyErr_Clear();
11326 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11327 }
11328
11329 if (!PyUnicode_IS_ASCII(left))
11330 return 0;
11331
11332 right_uni = _PyUnicode_FromId(right); /* borrowed */
11333 if (right_uni == NULL) {
11334 /* memory error or bad data */
11335 PyErr_Clear();
11336 return _PyUnicode_EqualToASCIIString(left, right->string);
11337 }
11338
11339 if (left == right_uni)
11340 return 1;
11341
11342 if (PyUnicode_CHECK_INTERNED(left))
11343 return 0;
11344
Victor Stinner607b1022020-05-05 18:50:30 +020011345#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011346 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011347 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011348 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11349 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011350#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011351
11352 return unicode_compare_eq(left, right_uni);
11353}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011354
Alexander Belopolsky40018472011-02-26 01:02:56 +000011355PyObject *
11356PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011357{
11358 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011359
Victor Stinnere5567ad2012-10-23 02:48:49 +020011360 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11361 Py_RETURN_NOTIMPLEMENTED;
11362
11363 if (PyUnicode_READY(left) == -1 ||
11364 PyUnicode_READY(right) == -1)
11365 return NULL;
11366
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011367 if (left == right) {
11368 switch (op) {
11369 case Py_EQ:
11370 case Py_LE:
11371 case Py_GE:
11372 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011373 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011374 case Py_NE:
11375 case Py_LT:
11376 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011377 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011378 default:
11379 PyErr_BadArgument();
11380 return NULL;
11381 }
11382 }
11383 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011384 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011385 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011386 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011387 }
11388 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011389 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011390 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011391 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011392}
11393
Alexander Belopolsky40018472011-02-26 01:02:56 +000011394int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011395_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11396{
11397 return unicode_eq(aa, bb);
11398}
11399
11400int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011401PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011402{
Victor Stinner77282cb2013-04-14 19:22:47 +020011403 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011404 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011406 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011407
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011408 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011409 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011410 "'in <string>' requires string as left operand, not %.100s",
11411 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011412 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011413 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011414 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011415 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011416 if (ensure_unicode(str) < 0)
11417 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011420 kind2 = PyUnicode_KIND(substr);
11421 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011422 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011424 len2 = PyUnicode_GET_LENGTH(substr);
11425 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011426 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011427 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011428 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011429 if (len2 == 1) {
11430 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11431 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011432 return result;
11433 }
11434 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011435 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011436 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011437 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439
Victor Stinner77282cb2013-04-14 19:22:47 +020011440 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 case PyUnicode_1BYTE_KIND:
11442 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11443 break;
11444 case PyUnicode_2BYTE_KIND:
11445 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11446 break;
11447 case PyUnicode_4BYTE_KIND:
11448 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11449 break;
11450 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011451 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011453
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011454 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011455 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011456 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457
Guido van Rossum403d68b2000-03-13 15:55:09 +000011458 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011459}
11460
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461/* Concat to string or Unicode object giving a new Unicode object. */
11462
Alexander Belopolsky40018472011-02-26 01:02:56 +000011463PyObject *
11464PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011466 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011467 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011468 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011470 if (ensure_unicode(left) < 0)
11471 return NULL;
11472
11473 if (!PyUnicode_Check(right)) {
11474 PyErr_Format(PyExc_TypeError,
11475 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011476 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011477 return NULL;
11478 }
11479 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011480 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
11482 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011483 if (left == unicode_empty)
11484 return PyUnicode_FromObject(right);
11485 if (right == unicode_empty)
11486 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011488 left_len = PyUnicode_GET_LENGTH(left);
11489 right_len = PyUnicode_GET_LENGTH(right);
11490 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011491 PyErr_SetString(PyExc_OverflowError,
11492 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011493 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011494 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011495 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011496
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011497 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11498 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011499 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011502 result = PyUnicode_New(new_len, maxchar);
11503 if (result == NULL)
11504 return NULL;
11505 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11506 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11507 assert(_PyUnicode_CheckConsistency(result, 1));
11508 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509}
11510
Walter Dörwald1ab83302007-05-18 17:15:44 +000011511void
Victor Stinner23e56682011-10-03 03:54:37 +020011512PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011513{
Victor Stinner23e56682011-10-03 03:54:37 +020011514 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011515 Py_UCS4 maxchar, maxchar2;
11516 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011517
11518 if (p_left == NULL) {
11519 if (!PyErr_Occurred())
11520 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011521 return;
11522 }
Victor Stinner23e56682011-10-03 03:54:37 +020011523 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011524 if (right == NULL || left == NULL
11525 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011526 if (!PyErr_Occurred())
11527 PyErr_BadInternalCall();
11528 goto error;
11529 }
11530
Benjamin Petersonbac79492012-01-14 13:34:47 -050011531 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011532 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011533 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011534 goto error;
11535
Victor Stinner488fa492011-12-12 00:01:39 +010011536 /* Shortcuts */
11537 if (left == unicode_empty) {
11538 Py_DECREF(left);
11539 Py_INCREF(right);
11540 *p_left = right;
11541 return;
11542 }
11543 if (right == unicode_empty)
11544 return;
11545
11546 left_len = PyUnicode_GET_LENGTH(left);
11547 right_len = PyUnicode_GET_LENGTH(right);
11548 if (left_len > PY_SSIZE_T_MAX - right_len) {
11549 PyErr_SetString(PyExc_OverflowError,
11550 "strings are too large to concat");
11551 goto error;
11552 }
11553 new_len = left_len + right_len;
11554
11555 if (unicode_modifiable(left)
11556 && PyUnicode_CheckExact(right)
11557 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011558 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11559 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011560 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011561 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011562 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11563 {
11564 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011565 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011566 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011567
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011568 /* copy 'right' into the newly allocated area of 'left' */
11569 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011570 }
Victor Stinner488fa492011-12-12 00:01:39 +010011571 else {
11572 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11573 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011574 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011575
Victor Stinner488fa492011-12-12 00:01:39 +010011576 /* Concat the two Unicode strings */
11577 res = PyUnicode_New(new_len, maxchar);
11578 if (res == NULL)
11579 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011580 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11581 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011582 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011583 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011584 }
11585 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011586 return;
11587
11588error:
Victor Stinner488fa492011-12-12 00:01:39 +010011589 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011590}
11591
11592void
11593PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11594{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011595 PyUnicode_Append(pleft, right);
11596 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011597}
11598
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011599/*
11600Wraps stringlib_parse_args_finds() and additionally ensures that the
11601first argument is a unicode object.
11602*/
11603
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011604static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011605parse_args_finds_unicode(const char * function_name, PyObject *args,
11606 PyObject **substring,
11607 Py_ssize_t *start, Py_ssize_t *end)
11608{
11609 if(stringlib_parse_args_finds(function_name, args, substring,
11610 start, end)) {
11611 if (ensure_unicode(*substring) < 0)
11612 return 0;
11613 return 1;
11614 }
11615 return 0;
11616}
11617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011618PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011621Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011622string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624
11625static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011626unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011628 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011629 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011630 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011632 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011633 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011636 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 kind1 = PyUnicode_KIND(self);
11640 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011641 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011642 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 len1 = PyUnicode_GET_LENGTH(self);
11645 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011647 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011648 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011649
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011650 buf1 = PyUnicode_DATA(self);
11651 buf2 = PyUnicode_DATA(substring);
11652 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011653 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011654 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011655 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011656 }
11657 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 case PyUnicode_1BYTE_KIND:
11659 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011660 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 buf2, len2, PY_SSIZE_T_MAX
11662 );
11663 break;
11664 case PyUnicode_2BYTE_KIND:
11665 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011666 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 buf2, len2, PY_SSIZE_T_MAX
11668 );
11669 break;
11670 case PyUnicode_4BYTE_KIND:
11671 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011672 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 buf2, len2, PY_SSIZE_T_MAX
11674 );
11675 break;
11676 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011677 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 }
11679
11680 result = PyLong_FromSsize_t(iresult);
11681
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011682 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011683 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011684 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686 return result;
11687}
11688
INADA Naoki3ae20562017-01-16 20:41:20 +090011689/*[clinic input]
11690str.encode as unicode_encode
11691
11692 encoding: str(c_default="NULL") = 'utf-8'
11693 The encoding in which to encode the string.
11694 errors: str(c_default="NULL") = 'strict'
11695 The error handling scheme to use for encoding errors.
11696 The default is 'strict' meaning that encoding errors raise a
11697 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11698 'xmlcharrefreplace' as well as any other name registered with
11699 codecs.register_error that can handle UnicodeEncodeErrors.
11700
11701Encode the string using the codec registered for encoding.
11702[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703
11704static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011705unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011706/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011708 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011709}
11710
INADA Naoki3ae20562017-01-16 20:41:20 +090011711/*[clinic input]
11712str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713
INADA Naoki3ae20562017-01-16 20:41:20 +090011714 tabsize: int = 8
11715
11716Return a copy where all tab characters are expanded using spaces.
11717
11718If tabsize is not given, a tab size of 8 characters is assumed.
11719[clinic start generated code]*/
11720
11721static PyObject *
11722unicode_expandtabs_impl(PyObject *self, int tabsize)
11723/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011725 Py_ssize_t i, j, line_pos, src_len, incr;
11726 Py_UCS4 ch;
11727 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011728 const void *src_data;
11729 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011730 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011731 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732
Antoine Pitrou22425222011-10-04 19:10:51 +020011733 if (PyUnicode_READY(self) == -1)
11734 return NULL;
11735
Thomas Wouters7e474022000-07-16 12:04:32 +000011736 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011737 src_len = PyUnicode_GET_LENGTH(self);
11738 i = j = line_pos = 0;
11739 kind = PyUnicode_KIND(self);
11740 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011741 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011742 for (; i < src_len; i++) {
11743 ch = PyUnicode_READ(kind, src_data, i);
11744 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011745 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011747 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011748 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011749 goto overflow;
11750 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011751 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011752 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011756 goto overflow;
11757 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011759 if (ch == '\n' || ch == '\r')
11760 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011762 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011763 if (!found)
11764 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011765
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011767 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768 if (!u)
11769 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011770 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771
Antoine Pitroue71d5742011-10-04 15:55:09 +020011772 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773
Antoine Pitroue71d5742011-10-04 15:55:09 +020011774 for (; i < src_len; i++) {
11775 ch = PyUnicode_READ(kind, src_data, i);
11776 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011777 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011778 incr = tabsize - (line_pos % tabsize);
11779 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011780 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011781 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011783 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011784 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011785 line_pos++;
11786 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011787 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011788 if (ch == '\n' || ch == '\r')
11789 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011791 }
11792 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011793 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011794
Antoine Pitroue71d5742011-10-04 15:55:09 +020011795 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011796 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11797 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798}
11799
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011800PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011801 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802\n\
11803Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011804such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805arguments start and end are interpreted as in slice notation.\n\
11806\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011807Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
11809static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011812 /* initialize variables to prevent gcc warning */
11813 PyObject *substring = NULL;
11814 Py_ssize_t start = 0;
11815 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011816 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011818 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011821 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011824 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 if (result == -2)
11827 return NULL;
11828
Christian Heimes217cfd12007-12-02 14:31:20 +000011829 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830}
11831
11832static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011833unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011835 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011836 enum PyUnicode_Kind kind;
11837 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011838
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011839 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011840 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011842 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011843 if (PyUnicode_READY(self) == -1) {
11844 return NULL;
11845 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011846 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11847 PyErr_SetString(PyExc_IndexError, "string index out of range");
11848 return NULL;
11849 }
11850 kind = PyUnicode_KIND(self);
11851 data = PyUnicode_DATA(self);
11852 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011853 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854}
11855
Guido van Rossumc2504932007-09-18 19:42:40 +000011856/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011857 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011858static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011859unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011861 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011862
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011863#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011864 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011865#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 if (_PyUnicode_HASH(self) != -1)
11867 return _PyUnicode_HASH(self);
11868 if (PyUnicode_READY(self) == -1)
11869 return -1;
animalizea1d14252019-01-02 20:16:06 +080011870
Christian Heimes985ecdc2013-11-20 11:46:18 +010011871 x = _Py_HashBytes(PyUnicode_DATA(self),
11872 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011874 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875}
11876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011877PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879\n\
oldkaa0735f2018-02-02 16:52:55 +080011880Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011881such that sub is contained within S[start:end]. Optional\n\
11882arguments start and end are interpreted as in slice notation.\n\
11883\n\
11884Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
11886static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011889 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011890 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011891 PyObject *substring = NULL;
11892 Py_ssize_t start = 0;
11893 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011895 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011898 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011901 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 if (result == -2)
11904 return NULL;
11905
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906 if (result < 0) {
11907 PyErr_SetString(PyExc_ValueError, "substring not found");
11908 return NULL;
11909 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011910
Christian Heimes217cfd12007-12-02 14:31:20 +000011911 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912}
11913
INADA Naoki3ae20562017-01-16 20:41:20 +090011914/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011915str.isascii as unicode_isascii
11916
11917Return True if all characters in the string are ASCII, False otherwise.
11918
11919ASCII characters have code points in the range U+0000-U+007F.
11920Empty string is ASCII too.
11921[clinic start generated code]*/
11922
11923static PyObject *
11924unicode_isascii_impl(PyObject *self)
11925/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11926{
11927 if (PyUnicode_READY(self) == -1) {
11928 return NULL;
11929 }
11930 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11931}
11932
11933/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011934str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935
INADA Naoki3ae20562017-01-16 20:41:20 +090011936Return True if the string is a lowercase string, False otherwise.
11937
11938A string is lowercase if all cased characters in the string are lowercase and
11939there is at least one cased character in the string.
11940[clinic start generated code]*/
11941
11942static PyObject *
11943unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011944/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 Py_ssize_t i, length;
11947 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011948 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 int cased;
11950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 if (PyUnicode_READY(self) == -1)
11952 return NULL;
11953 length = PyUnicode_GET_LENGTH(self);
11954 kind = PyUnicode_KIND(self);
11955 data = PyUnicode_DATA(self);
11956
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 if (length == 1)
11959 return PyBool_FromLong(
11960 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011962 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011964 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011965
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 for (i = 0; i < length; i++) {
11968 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011969
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011971 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 else if (!cased && Py_UNICODE_ISLOWER(ch))
11973 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011975 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976}
11977
INADA Naoki3ae20562017-01-16 20:41:20 +090011978/*[clinic input]
11979str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980
INADA Naoki3ae20562017-01-16 20:41:20 +090011981Return True if the string is an uppercase string, False otherwise.
11982
11983A string is uppercase if all cased characters in the string are uppercase and
11984there is at least one cased character in the string.
11985[clinic start generated code]*/
11986
11987static PyObject *
11988unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011989/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 Py_ssize_t i, length;
11992 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011993 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994 int cased;
11995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 if (PyUnicode_READY(self) == -1)
11997 return NULL;
11998 length = PyUnicode_GET_LENGTH(self);
11999 kind = PyUnicode_KIND(self);
12000 data = PyUnicode_DATA(self);
12001
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 if (length == 1)
12004 return PyBool_FromLong(
12005 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012007 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012009 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012010
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 for (i = 0; i < length; i++) {
12013 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012014
Benjamin Peterson29060642009-01-31 22:14:21 +000012015 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012016 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012017 else if (!cased && Py_UNICODE_ISUPPER(ch))
12018 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012020 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021}
12022
INADA Naoki3ae20562017-01-16 20:41:20 +090012023/*[clinic input]
12024str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025
INADA Naoki3ae20562017-01-16 20:41:20 +090012026Return True if the string is a title-cased string, False otherwise.
12027
12028In a title-cased string, upper- and title-case characters may only
12029follow uncased characters and lowercase characters only cased ones.
12030[clinic start generated code]*/
12031
12032static PyObject *
12033unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012034/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 Py_ssize_t i, length;
12037 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012038 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039 int cased, previous_is_cased;
12040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 if (PyUnicode_READY(self) == -1)
12042 return NULL;
12043 length = PyUnicode_GET_LENGTH(self);
12044 kind = PyUnicode_KIND(self);
12045 data = PyUnicode_DATA(self);
12046
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 if (length == 1) {
12049 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12050 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12051 (Py_UNICODE_ISUPPER(ch) != 0));
12052 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012054 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012056 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012057
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058 cased = 0;
12059 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 for (i = 0; i < length; i++) {
12061 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012062
Benjamin Peterson29060642009-01-31 22:14:21 +000012063 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12064 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012065 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012066 previous_is_cased = 1;
12067 cased = 1;
12068 }
12069 else if (Py_UNICODE_ISLOWER(ch)) {
12070 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012071 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012072 previous_is_cased = 1;
12073 cased = 1;
12074 }
12075 else
12076 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012078 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079}
12080
INADA Naoki3ae20562017-01-16 20:41:20 +090012081/*[clinic input]
12082str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083
INADA Naoki3ae20562017-01-16 20:41:20 +090012084Return True if the string is a whitespace string, False otherwise.
12085
12086A string is whitespace if all characters in the string are whitespace and there
12087is at least one character in the string.
12088[clinic start generated code]*/
12089
12090static PyObject *
12091unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012092/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012094 Py_ssize_t i, length;
12095 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012096 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097
12098 if (PyUnicode_READY(self) == -1)
12099 return NULL;
12100 length = PyUnicode_GET_LENGTH(self);
12101 kind = PyUnicode_KIND(self);
12102 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 if (length == 1)
12106 return PyBool_FromLong(
12107 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012109 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012111 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 for (i = 0; i < length; i++) {
12114 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012115 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012116 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012118 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119}
12120
INADA Naoki3ae20562017-01-16 20:41:20 +090012121/*[clinic input]
12122str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012123
INADA Naoki3ae20562017-01-16 20:41:20 +090012124Return True if the string is an alphabetic string, False otherwise.
12125
12126A string is alphabetic if all characters in the string are alphabetic and there
12127is at least one character in the string.
12128[clinic start generated code]*/
12129
12130static PyObject *
12131unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012132/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 Py_ssize_t i, length;
12135 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012136 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137
12138 if (PyUnicode_READY(self) == -1)
12139 return NULL;
12140 length = PyUnicode_GET_LENGTH(self);
12141 kind = PyUnicode_KIND(self);
12142 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012143
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012144 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 if (length == 1)
12146 return PyBool_FromLong(
12147 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012148
12149 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012151 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 for (i = 0; i < length; i++) {
12154 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012155 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012156 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012157 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012158}
12159
INADA Naoki3ae20562017-01-16 20:41:20 +090012160/*[clinic input]
12161str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012162
INADA Naoki3ae20562017-01-16 20:41:20 +090012163Return True if the string is an alpha-numeric string, False otherwise.
12164
12165A string is alpha-numeric if all characters in the string are alpha-numeric and
12166there is at least one character in the string.
12167[clinic start generated code]*/
12168
12169static PyObject *
12170unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012171/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012172{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012174 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 Py_ssize_t len, i;
12176
12177 if (PyUnicode_READY(self) == -1)
12178 return NULL;
12179
12180 kind = PyUnicode_KIND(self);
12181 data = PyUnicode_DATA(self);
12182 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012183
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012184 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 if (len == 1) {
12186 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12187 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12188 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012189
12190 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012192 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 for (i = 0; i < len; i++) {
12195 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012196 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012197 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012198 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012199 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012200}
12201
INADA Naoki3ae20562017-01-16 20:41:20 +090012202/*[clinic input]
12203str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204
INADA Naoki3ae20562017-01-16 20:41:20 +090012205Return True if the string is a decimal string, False otherwise.
12206
12207A string is a decimal string if all characters in the string are decimal and
12208there is at least one character in the string.
12209[clinic start generated code]*/
12210
12211static PyObject *
12212unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012213/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 Py_ssize_t i, length;
12216 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012217 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218
12219 if (PyUnicode_READY(self) == -1)
12220 return NULL;
12221 length = PyUnicode_GET_LENGTH(self);
12222 kind = PyUnicode_KIND(self);
12223 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 if (length == 1)
12227 return PyBool_FromLong(
12228 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012230 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012232 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 for (i = 0; i < length; i++) {
12235 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012236 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012238 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239}
12240
INADA Naoki3ae20562017-01-16 20:41:20 +090012241/*[clinic input]
12242str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243
INADA Naoki3ae20562017-01-16 20:41:20 +090012244Return True if the string is a digit string, False otherwise.
12245
12246A string is a digit string if all characters in the string are digits and there
12247is at least one character in the string.
12248[clinic start generated code]*/
12249
12250static PyObject *
12251unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012252/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 Py_ssize_t i, length;
12255 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012256 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257
12258 if (PyUnicode_READY(self) == -1)
12259 return NULL;
12260 length = PyUnicode_GET_LENGTH(self);
12261 kind = PyUnicode_KIND(self);
12262 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 if (length == 1) {
12266 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12267 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012270 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012272 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 for (i = 0; i < length; i++) {
12275 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012276 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012278 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279}
12280
INADA Naoki3ae20562017-01-16 20:41:20 +090012281/*[clinic input]
12282str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283
INADA Naoki3ae20562017-01-16 20:41:20 +090012284Return True if the string is a numeric string, False otherwise.
12285
12286A string is numeric if all characters in the string are numeric and there is at
12287least one character in the string.
12288[clinic start generated code]*/
12289
12290static PyObject *
12291unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012292/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 Py_ssize_t i, length;
12295 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012296 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297
12298 if (PyUnicode_READY(self) == -1)
12299 return NULL;
12300 length = PyUnicode_GET_LENGTH(self);
12301 kind = PyUnicode_KIND(self);
12302 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 if (length == 1)
12306 return PyBool_FromLong(
12307 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012309 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012311 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 for (i = 0; i < length; i++) {
12314 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012315 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012317 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318}
12319
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012320Py_ssize_t
12321_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012324 if (PyUnicode_READY(self) == -1)
12325 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012326
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012327 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012328 if (len == 0) {
12329 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 }
12332
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012333 int kind = PyUnicode_KIND(self);
12334 const void *data = PyUnicode_DATA(self);
12335 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012336 /* PEP 3131 says that the first character must be in
12337 XID_Start and subsequent characters in XID_Continue,
12338 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012339 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012340 letters, digits, underscore). However, given the current
12341 definition of XID_Start and XID_Continue, it is sufficient
12342 to check just for these, except that _ must be allowed
12343 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012344 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012345 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012346 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012347
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012348 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012349 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012350 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012351 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012352 }
12353 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012354 return i;
12355}
12356
12357int
12358PyUnicode_IsIdentifier(PyObject *self)
12359{
12360 if (PyUnicode_IS_READY(self)) {
12361 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12362 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12363 /* an empty string is not a valid identifier */
12364 return len && i == len;
12365 }
12366 else {
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012367 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012368 if (len == 0) {
12369 /* an empty string is not a valid identifier */
12370 return 0;
12371 }
12372
12373 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012374 Py_UCS4 ch = wstr[i++];
12375#if SIZEOF_WCHAR_T == 2
12376 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12377 && i < len
12378 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12379 {
12380 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12381 i++;
12382 }
12383#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012384 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12385 return 0;
12386 }
12387
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012388 while (i < len) {
12389 ch = wstr[i++];
12390#if SIZEOF_WCHAR_T == 2
12391 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12392 && i < len
12393 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12394 {
12395 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12396 i++;
12397 }
12398#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012399 if (!_PyUnicode_IsXidContinue(ch)) {
12400 return 0;
12401 }
12402 }
12403 return 1;
12404 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012405}
12406
INADA Naoki3ae20562017-01-16 20:41:20 +090012407/*[clinic input]
12408str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012409
INADA Naoki3ae20562017-01-16 20:41:20 +090012410Return True if the string is a valid Python identifier, False otherwise.
12411
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012412Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012413such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012414[clinic start generated code]*/
12415
12416static PyObject *
12417unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012418/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012419{
12420 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12421}
12422
INADA Naoki3ae20562017-01-16 20:41:20 +090012423/*[clinic input]
12424str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012425
INADA Naoki3ae20562017-01-16 20:41:20 +090012426Return True if the string is printable, False otherwise.
12427
12428A string is printable if all of its characters are considered printable in
12429repr() or if it is empty.
12430[clinic start generated code]*/
12431
12432static PyObject *
12433unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012434/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012435{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436 Py_ssize_t i, length;
12437 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012438 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439
12440 if (PyUnicode_READY(self) == -1)
12441 return NULL;
12442 length = PyUnicode_GET_LENGTH(self);
12443 kind = PyUnicode_KIND(self);
12444 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012445
12446 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 if (length == 1)
12448 return PyBool_FromLong(
12449 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 for (i = 0; i < length; i++) {
12452 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012453 Py_RETURN_FALSE;
12454 }
12455 }
12456 Py_RETURN_TRUE;
12457}
12458
INADA Naoki3ae20562017-01-16 20:41:20 +090012459/*[clinic input]
12460str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461
INADA Naoki3ae20562017-01-16 20:41:20 +090012462 iterable: object
12463 /
12464
12465Concatenate any number of strings.
12466
Martin Panter91a88662017-01-24 00:30:06 +000012467The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012468The result is returned as a new string.
12469
12470Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12471[clinic start generated code]*/
12472
12473static PyObject *
12474unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012475/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476{
INADA Naoki3ae20562017-01-16 20:41:20 +090012477 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478}
12479
Martin v. Löwis18e16552006-02-15 17:27:45 +000012480static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012481unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 if (PyUnicode_READY(self) == -1)
12484 return -1;
12485 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486}
12487
INADA Naoki3ae20562017-01-16 20:41:20 +090012488/*[clinic input]
12489str.ljust as unicode_ljust
12490
12491 width: Py_ssize_t
12492 fillchar: Py_UCS4 = ' '
12493 /
12494
12495Return a left-justified string of length width.
12496
12497Padding is done using the specified fill character (default is a space).
12498[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499
12500static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012501unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12502/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012504 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506
Victor Stinnerc4b49542011-12-11 22:44:26 +010012507 if (PyUnicode_GET_LENGTH(self) >= width)
12508 return unicode_result_unchanged(self);
12509
12510 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511}
12512
INADA Naoki3ae20562017-01-16 20:41:20 +090012513/*[clinic input]
12514str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
INADA Naoki3ae20562017-01-16 20:41:20 +090012516Return a copy of the string converted to lowercase.
12517[clinic start generated code]*/
12518
12519static PyObject *
12520unicode_lower_impl(PyObject *self)
12521/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012523 if (PyUnicode_READY(self) == -1)
12524 return NULL;
12525 if (PyUnicode_IS_ASCII(self))
12526 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012527 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528}
12529
/* Strip modes understood by the strip helpers.  The values are also used
   as indexes into stripfuncnames, so they must stay 0, 1, 2. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and the
   table of pointers are read-only: nothing is ever supposed to write to it. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method implementing strip mode i (used in messages). */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012538
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012539/* externally visible for str.strip(unicode) */
12540PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012541_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012542{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012543 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 int kind;
12545 Py_ssize_t i, j, len;
12546 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012547 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12550 return NULL;
12551
12552 kind = PyUnicode_KIND(self);
12553 data = PyUnicode_DATA(self);
12554 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012555 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12557 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012558 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012559
Benjamin Peterson14339b62009-01-31 16:36:08 +000012560 i = 0;
12561 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012562 while (i < len) {
12563 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12564 if (!BLOOM(sepmask, ch))
12565 break;
12566 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12567 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012568 i++;
12569 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012570 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012571
Benjamin Peterson14339b62009-01-31 16:36:08 +000012572 j = len;
12573 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012574 j--;
12575 while (j >= i) {
12576 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12577 if (!BLOOM(sepmask, ch))
12578 break;
12579 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12580 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012581 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012582 }
12583
Benjamin Peterson29060642009-01-31 22:14:21 +000012584 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012585 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012586
Victor Stinner7931d9a2011-11-04 00:22:48 +010012587 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588}
12589
12590PyObject*
12591PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12592{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012593 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012595 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596
Victor Stinnerde636f32011-10-01 03:55:54 +020012597 if (PyUnicode_READY(self) == -1)
12598 return NULL;
12599
Victor Stinner684d5fd2012-05-03 02:32:34 +020012600 length = PyUnicode_GET_LENGTH(self);
12601 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012602
Victor Stinner684d5fd2012-05-03 02:32:34 +020012603 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012604 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605
Victor Stinnerde636f32011-10-01 03:55:54 +020012606 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012607 PyErr_SetString(PyExc_IndexError, "string index out of range");
12608 return NULL;
12609 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012610 if (start >= length || end < start)
12611 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012612
Victor Stinner684d5fd2012-05-03 02:32:34 +020012613 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012614 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012615 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012616 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012617 }
12618 else {
12619 kind = PyUnicode_KIND(self);
12620 data = PyUnicode_1BYTE_DATA(self);
12621 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012622 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012623 length);
12624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626
12627static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012628do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 Py_ssize_t len, i, j;
12631
12632 if (PyUnicode_READY(self) == -1)
12633 return NULL;
12634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012636
Victor Stinnercc7af722013-04-09 22:39:24 +020012637 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012638 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012639
12640 i = 0;
12641 if (striptype != RIGHTSTRIP) {
12642 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012643 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012644 if (!_Py_ascii_whitespace[ch])
12645 break;
12646 i++;
12647 }
12648 }
12649
12650 j = len;
12651 if (striptype != LEFTSTRIP) {
12652 j--;
12653 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012654 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012655 if (!_Py_ascii_whitespace[ch])
12656 break;
12657 j--;
12658 }
12659 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012660 }
12661 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012662 else {
12663 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012664 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012665
Victor Stinnercc7af722013-04-09 22:39:24 +020012666 i = 0;
12667 if (striptype != RIGHTSTRIP) {
12668 while (i < len) {
12669 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12670 if (!Py_UNICODE_ISSPACE(ch))
12671 break;
12672 i++;
12673 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012674 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012675
12676 j = len;
12677 if (striptype != LEFTSTRIP) {
12678 j--;
12679 while (j >= i) {
12680 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12681 if (!Py_UNICODE_ISSPACE(ch))
12682 break;
12683 j--;
12684 }
12685 j++;
12686 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012687 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012688
Victor Stinner7931d9a2011-11-04 00:22:48 +010012689 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690}
12691
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012692
12693static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012694do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012695{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012696 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012697 if (PyUnicode_Check(sep))
12698 return _PyUnicode_XStrip(self, striptype, sep);
12699 else {
12700 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012701 "%s arg must be None or str",
12702 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012703 return NULL;
12704 }
12705 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012706
Benjamin Peterson14339b62009-01-31 16:36:08 +000012707 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012708}
12709
12710
INADA Naoki3ae20562017-01-16 20:41:20 +090012711/*[clinic input]
12712str.strip as unicode_strip
12713
12714 chars: object = None
12715 /
12716
Zachary Ware09895c22019-10-09 16:09:00 -050012717Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012718
12719If chars is given and not None, remove characters in chars instead.
12720[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012721
12722static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012723unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012724/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012725{
INADA Naoki3ae20562017-01-16 20:41:20 +090012726 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012727}
12728
12729
INADA Naoki3ae20562017-01-16 20:41:20 +090012730/*[clinic input]
12731str.lstrip as unicode_lstrip
12732
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012733 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012734 /
12735
12736Return a copy of the string with leading whitespace removed.
12737
12738If chars is given and not None, remove characters in chars instead.
12739[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012740
12741static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012742unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012743/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012744{
INADA Naoki3ae20562017-01-16 20:41:20 +090012745 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012746}
12747
12748
INADA Naoki3ae20562017-01-16 20:41:20 +090012749/*[clinic input]
12750str.rstrip as unicode_rstrip
12751
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012752 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012753 /
12754
12755Return a copy of the string with trailing whitespace removed.
12756
12757If chars is given and not None, remove characters in chars instead.
12758[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012759
12760static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012761unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012762/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012763{
INADA Naoki3ae20562017-01-16 20:41:20 +090012764 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012765}
12766
12767
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012769unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012771 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012773
Serhiy Storchaka05997252013-01-26 12:14:02 +020012774 if (len < 1)
12775 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776
Victor Stinnerc4b49542011-12-11 22:44:26 +010012777 /* no repeat, return original string */
12778 if (len == 1)
12779 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012780
Benjamin Petersonbac79492012-01-14 13:34:47 -050012781 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782 return NULL;
12783
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012784 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012785 PyErr_SetString(PyExc_OverflowError,
12786 "repeated string is too long");
12787 return NULL;
12788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012790
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012791 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792 if (!u)
12793 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012794 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012797 int kind = PyUnicode_KIND(str);
12798 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012799 if (kind == PyUnicode_1BYTE_KIND) {
12800 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012801 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012802 }
12803 else if (kind == PyUnicode_2BYTE_KIND) {
12804 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012805 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012806 ucs2[n] = fill_char;
12807 } else {
12808 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12809 assert(kind == PyUnicode_4BYTE_KIND);
12810 for (n = 0; n < len; ++n)
12811 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813 }
12814 else {
12815 /* number of characters copied this far */
12816 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012817 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012819 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012820 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012823 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012824 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826 }
12827
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012828 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012829 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830}
12831
Alexander Belopolsky40018472011-02-26 01:02:56 +000012832PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012833PyUnicode_Replace(PyObject *str,
12834 PyObject *substr,
12835 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012836 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012838 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12839 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012840 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012841 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842}
12843
INADA Naoki3ae20562017-01-16 20:41:20 +090012844/*[clinic input]
12845str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846
INADA Naoki3ae20562017-01-16 20:41:20 +090012847 old: unicode
12848 new: unicode
12849 count: Py_ssize_t = -1
12850 Maximum number of occurrences to replace.
12851 -1 (the default value) means replace all occurrences.
12852 /
12853
12854Return a copy with all occurrences of substring old replaced by new.
12855
12856If the optional argument count is given, only the first count occurrences are
12857replaced.
12858[clinic start generated code]*/
12859
12860static PyObject *
12861unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12862 Py_ssize_t count)
12863/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012865 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012867 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868}
12869
sweeneydea81849b2020-04-22 17:05:48 -040012870/*[clinic input]
12871str.removeprefix as unicode_removeprefix
12872
12873 prefix: unicode
12874 /
12875
12876Return a str with the given prefix string removed if present.
12877
12878If the string starts with the prefix string, return string[len(prefix):].
12879Otherwise, return a copy of the original string.
12880[clinic start generated code]*/
12881
12882static PyObject *
12883unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12884/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12885{
12886 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12887 if (match == -1) {
12888 return NULL;
12889 }
12890 if (match) {
12891 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12892 PyUnicode_GET_LENGTH(self));
12893 }
12894 return unicode_result_unchanged(self);
12895}
12896
12897/*[clinic input]
12898str.removesuffix as unicode_removesuffix
12899
12900 suffix: unicode
12901 /
12902
12903Return a str with the given suffix string removed if present.
12904
12905If the string ends with the suffix string and that suffix is not empty,
12906return string[:-len(suffix)]. Otherwise, return a copy of the original
12907string.
12908[clinic start generated code]*/
12909
12910static PyObject *
12911unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12912/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12913{
12914 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12915 if (match == -1) {
12916 return NULL;
12917 }
12918 if (match) {
12919 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12920 - PyUnicode_GET_LENGTH(suffix));
12921 }
12922 return unicode_result_unchanged(self);
12923}
12924
Alexander Belopolsky40018472011-02-26 01:02:56 +000012925static PyObject *
12926unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012928 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 Py_ssize_t isize;
12930 Py_ssize_t osize, squote, dquote, i, o;
12931 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012932 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012933 const void *idata;
12934 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012937 return NULL;
12938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 isize = PyUnicode_GET_LENGTH(unicode);
12940 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 /* Compute length of output, quote characters, and
12943 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012944 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012945 max = 127;
12946 squote = dquote = 0;
12947 ikind = PyUnicode_KIND(unicode);
12948 for (i = 0; i < isize; i++) {
12949 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012950 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012952 case '\'': squote++; break;
12953 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012955 incr = 2;
12956 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 default:
12958 /* Fast-path ASCII */
12959 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012960 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012961 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012962 ;
12963 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012966 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012968 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012970 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012971 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012972 if (osize > PY_SSIZE_T_MAX - incr) {
12973 PyErr_SetString(PyExc_OverflowError,
12974 "string is too long to generate repr");
12975 return NULL;
12976 }
12977 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012978 }
12979
12980 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012981 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012983 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 if (dquote)
12985 /* Both squote and dquote present. Use squote,
12986 and escape them */
12987 osize += squote;
12988 else
12989 quote = '"';
12990 }
Victor Stinner55c08782013-04-14 18:45:39 +020012991 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992
12993 repr = PyUnicode_New(osize, max);
12994 if (repr == NULL)
12995 return NULL;
12996 okind = PyUnicode_KIND(repr);
12997 odata = PyUnicode_DATA(repr);
12998
12999 PyUnicode_WRITE(okind, odata, 0, quote);
13000 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013001 if (unchanged) {
13002 _PyUnicode_FastCopyCharacters(repr, 1,
13003 unicode, 0,
13004 isize);
13005 }
13006 else {
13007 for (i = 0, o = 1; i < isize; i++) {
13008 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009
Victor Stinner55c08782013-04-14 18:45:39 +020013010 /* Escape quotes and backslashes */
13011 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013012 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013014 continue;
13015 }
13016
13017 /* Map special whitespace to '\t', \n', '\r' */
13018 if (ch == '\t') {
13019 PyUnicode_WRITE(okind, odata, o++, '\\');
13020 PyUnicode_WRITE(okind, odata, o++, 't');
13021 }
13022 else if (ch == '\n') {
13023 PyUnicode_WRITE(okind, odata, o++, '\\');
13024 PyUnicode_WRITE(okind, odata, o++, 'n');
13025 }
13026 else if (ch == '\r') {
13027 PyUnicode_WRITE(okind, odata, o++, '\\');
13028 PyUnicode_WRITE(okind, odata, o++, 'r');
13029 }
13030
13031 /* Map non-printable US ASCII to '\xhh' */
13032 else if (ch < ' ' || ch == 0x7F) {
13033 PyUnicode_WRITE(okind, odata, o++, '\\');
13034 PyUnicode_WRITE(okind, odata, o++, 'x');
13035 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13036 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13037 }
13038
13039 /* Copy ASCII characters as-is */
13040 else if (ch < 0x7F) {
13041 PyUnicode_WRITE(okind, odata, o++, ch);
13042 }
13043
13044 /* Non-ASCII characters */
13045 else {
13046 /* Map Unicode whitespace and control characters
13047 (categories Z* and C* except ASCII space)
13048 */
13049 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13050 PyUnicode_WRITE(okind, odata, o++, '\\');
13051 /* Map 8-bit characters to '\xhh' */
13052 if (ch <= 0xff) {
13053 PyUnicode_WRITE(okind, odata, o++, 'x');
13054 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13055 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13056 }
13057 /* Map 16-bit characters to '\uxxxx' */
13058 else if (ch <= 0xffff) {
13059 PyUnicode_WRITE(okind, odata, o++, 'u');
13060 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13061 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13062 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13063 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13064 }
13065 /* Map 21-bit characters to '\U00xxxxxx' */
13066 else {
13067 PyUnicode_WRITE(okind, odata, o++, 'U');
13068 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13069 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13070 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13071 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13072 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13073 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13074 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13075 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13076 }
13077 }
13078 /* Copy characters as-is */
13079 else {
13080 PyUnicode_WRITE(okind, odata, o++, ch);
13081 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013082 }
13083 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013086 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013087 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088}
13089
/* Help text for S.rfind(): documents the call signature, the optional
   slice-style start/end arguments, and the -1 result on failure. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013090PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013091 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092\n\
13093Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013094such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095arguments start and end are interpreted as in slice notation.\n\
13096\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013097Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098
13099static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013100unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013102 /* initialize variables to prevent gcc warning */
13103 PyObject *substring = NULL;
13104 Py_ssize_t start = 0;
13105 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013106 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013108 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013109 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013111 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013114 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013116 if (result == -2)
13117 return NULL;
13118
Christian Heimes217cfd12007-12-02 14:31:20 +000013119 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120}
13121
/* Help text for S.rindex(): documents the call signature, the optional
   slice-style start/end arguments, and the ValueError raised when the
   substring is not found. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013122PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013123 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013125Return the highest index in S where substring sub is found,\n\
13126such that sub is contained within S[start:end]. Optional\n\
13127arguments start and end are interpreted as in slice notation.\n\
13128\n\
13129Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130
13131static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013134 /* initialize variables to prevent gcc warning */
13135 PyObject *substring = NULL;
13136 Py_ssize_t start = 0;
13137 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013138 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013140 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013141 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013143 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013144 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013146 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013148 if (result == -2)
13149 return NULL;
13150
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151 if (result < 0) {
13152 PyErr_SetString(PyExc_ValueError, "substring not found");
13153 return NULL;
13154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013155
Christian Heimes217cfd12007-12-02 14:31:20 +000013156 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157}
13158
INADA Naoki3ae20562017-01-16 20:41:20 +090013159/*[clinic input]
13160str.rjust as unicode_rjust
13161
13162 width: Py_ssize_t
13163 fillchar: Py_UCS4 = ' '
13164 /
13165
13166Return a right-justified string of length width.
13167
13168Padding is done using the specified fill character (default is a space).
13169[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013170
13171static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013172unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13173/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013175 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013176 return NULL;
13177
Victor Stinnerc4b49542011-12-11 22:44:26 +010013178 if (PyUnicode_GET_LENGTH(self) >= width)
13179 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180
Victor Stinnerc4b49542011-12-11 22:44:26 +010013181 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013182}
13183
Alexander Belopolsky40018472011-02-26 01:02:56 +000013184PyObject *
13185PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013186{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013187 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013190 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191}
13192
INADA Naoki3ae20562017-01-16 20:41:20 +090013193/*[clinic input]
13194str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013195
INADA Naoki3ae20562017-01-16 20:41:20 +090013196 sep: object = None
13197 The delimiter according which to split the string.
13198 None (the default value) means split according to any whitespace,
13199 and discard empty strings from the result.
13200 maxsplit: Py_ssize_t = -1
13201 Maximum number of splits to do.
13202 -1 (the default value) means no limit.
13203
13204Return a list of the words in the string, using sep as the delimiter string.
13205[clinic start generated code]*/
13206
13207static PyObject *
13208unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13209/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013210{
INADA Naoki3ae20562017-01-16 20:41:20 +090013211 if (sep == Py_None)
13212 return split(self, NULL, maxsplit);
13213 if (PyUnicode_Check(sep))
13214 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013215
Victor Stinner998b8062018-09-12 00:23:25 +020013216 PyErr_Format(PyExc_TypeError,
13217 "must be str or None, not %.100s",
13218 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013219 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013220}
13221
Thomas Wouters477c8d52006-05-27 19:21:47 +000013222PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013223PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013224{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013225 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013226 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013227 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013228 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013229
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013230 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013231 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013232
Victor Stinner14f8f022011-10-05 20:58:25 +020013233 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013235 len1 = PyUnicode_GET_LENGTH(str_obj);
13236 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013237 if (kind1 < kind2 || len1 < len2) {
13238 _Py_INCREF_UNICODE_EMPTY();
13239 if (!unicode_empty)
13240 out = NULL;
13241 else {
13242 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13243 Py_DECREF(unicode_empty);
13244 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013245 return out;
13246 }
13247 buf1 = PyUnicode_DATA(str_obj);
13248 buf2 = PyUnicode_DATA(sep_obj);
13249 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013250 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013251 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013252 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013254
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013255 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013256 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013257 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13258 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13259 else
13260 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013261 break;
13262 case PyUnicode_2BYTE_KIND:
13263 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13264 break;
13265 case PyUnicode_4BYTE_KIND:
13266 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13267 break;
13268 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013269 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013270 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013271
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013272 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013273 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013274 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013275
13276 return out;
13277}
13278
13279
13280PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013281PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013282{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013283 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013284 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013285 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013287
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013288 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013289 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013290
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013291 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013292 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 len1 = PyUnicode_GET_LENGTH(str_obj);
13294 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013295 if (kind1 < kind2 || len1 < len2) {
13296 _Py_INCREF_UNICODE_EMPTY();
13297 if (!unicode_empty)
13298 out = NULL;
13299 else {
13300 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13301 Py_DECREF(unicode_empty);
13302 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013303 return out;
13304 }
13305 buf1 = PyUnicode_DATA(str_obj);
13306 buf2 = PyUnicode_DATA(sep_obj);
13307 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013308 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013309 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013310 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013312
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013313 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013314 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013315 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13316 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13317 else
13318 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013319 break;
13320 case PyUnicode_2BYTE_KIND:
13321 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13322 break;
13323 case PyUnicode_4BYTE_KIND:
13324 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13325 break;
13326 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013327 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013328 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013329
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013330 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013331 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013332 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013333
13334 return out;
13335}
13336
INADA Naoki3ae20562017-01-16 20:41:20 +090013337/*[clinic input]
13338str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013339
INADA Naoki3ae20562017-01-16 20:41:20 +090013340 sep: object
13341 /
13342
13343Partition the string into three parts using the given separator.
13344
13345This will search for the separator in the string. If the separator is found,
13346returns a 3-tuple containing the part before the separator, the separator
13347itself, and the part after it.
13348
13349If the separator is not found, returns a 3-tuple containing the original string
13350and two empty strings.
13351[clinic start generated code]*/
13352
13353static PyObject *
13354unicode_partition(PyObject *self, PyObject *sep)
13355/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013356{
INADA Naoki3ae20562017-01-16 20:41:20 +090013357 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013358}
13359
INADA Naoki3ae20562017-01-16 20:41:20 +090013360/*[clinic input]
13361str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013362
INADA Naoki3ae20562017-01-16 20:41:20 +090013363Partition the string into three parts using the given separator.
13364
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013365This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013366the separator is found, returns a 3-tuple containing the part before the
13367separator, the separator itself, and the part after it.
13368
13369If the separator is not found, returns a 3-tuple containing two empty strings
13370and the original string.
13371[clinic start generated code]*/
13372
13373static PyObject *
13374unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013375/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013376{
INADA Naoki3ae20562017-01-16 20:41:20 +090013377 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013378}
13379
Alexander Belopolsky40018472011-02-26 01:02:56 +000013380PyObject *
13381PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013382{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013383 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013384 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013385
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013386 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013387}
13388
INADA Naoki3ae20562017-01-16 20:41:20 +090013389/*[clinic input]
13390str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013391
INADA Naoki3ae20562017-01-16 20:41:20 +090013392Return a list of the words in the string, using sep as the delimiter string.
13393
13394Splits are done starting at the end of the string and working to the front.
13395[clinic start generated code]*/
13396
13397static PyObject *
13398unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13399/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013400{
INADA Naoki3ae20562017-01-16 20:41:20 +090013401 if (sep == Py_None)
13402 return rsplit(self, NULL, maxsplit);
13403 if (PyUnicode_Check(sep))
13404 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013405
Victor Stinner998b8062018-09-12 00:23:25 +020013406 PyErr_Format(PyExc_TypeError,
13407 "must be str or None, not %.100s",
13408 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013409 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013410}
13411
INADA Naoki3ae20562017-01-16 20:41:20 +090013412/*[clinic input]
13413str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013414
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013415 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013416
13417Return a list of the lines in the string, breaking at line boundaries.
13418
13419Line breaks are not included in the resulting list unless keepends is given and
13420true.
13421[clinic start generated code]*/
13422
13423static PyObject *
13424unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013425/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013427 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013428}
13429
13430static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013431PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013433 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013434}
13435
INADA Naoki3ae20562017-01-16 20:41:20 +090013436/*[clinic input]
13437str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438
INADA Naoki3ae20562017-01-16 20:41:20 +090013439Convert uppercase characters to lowercase and lowercase characters to uppercase.
13440[clinic start generated code]*/
13441
13442static PyObject *
13443unicode_swapcase_impl(PyObject *self)
13444/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013446 if (PyUnicode_READY(self) == -1)
13447 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013448 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449}
13450
Larry Hastings61272b72014-01-07 12:41:53 -080013451/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013452
Larry Hastings31826802013-10-19 00:09:25 -070013453@staticmethod
13454str.maketrans as unicode_maketrans
13455
13456 x: object
13457
13458 y: unicode=NULL
13459
13460 z: unicode=NULL
13461
13462 /
13463
13464Return a translation table usable for str.translate().
13465
13466If there is only one argument, it must be a dictionary mapping Unicode
13467ordinals (integers) or characters to Unicode ordinals, strings or None.
13468Character keys will be then converted to ordinals.
13469If there are two arguments, they must be strings of equal length, and
13470in the resulting dictionary, each character in x will be mapped to the
13471character at the same position in y. If there is a third argument, it
13472must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013473[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013474
Larry Hastings31826802013-10-19 00:09:25 -070013475static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013476unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013477/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013478{
Georg Brandlceee0772007-11-27 23:48:05 +000013479 PyObject *new = NULL, *key, *value;
13480 Py_ssize_t i = 0;
13481 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013482
Georg Brandlceee0772007-11-27 23:48:05 +000013483 new = PyDict_New();
13484 if (!new)
13485 return NULL;
13486 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013487 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013488 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013489
Georg Brandlceee0772007-11-27 23:48:05 +000013490 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013491 if (!PyUnicode_Check(x)) {
13492 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13493 "be a string if there is a second argument");
13494 goto err;
13495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013496 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013497 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13498 "arguments must have equal length");
13499 goto err;
13500 }
13501 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013502 x_kind = PyUnicode_KIND(x);
13503 y_kind = PyUnicode_KIND(y);
13504 x_data = PyUnicode_DATA(x);
13505 y_data = PyUnicode_DATA(y);
13506 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13507 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013508 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013509 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013510 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013511 if (!value) {
13512 Py_DECREF(key);
13513 goto err;
13514 }
Georg Brandlceee0772007-11-27 23:48:05 +000013515 res = PyDict_SetItem(new, key, value);
13516 Py_DECREF(key);
13517 Py_DECREF(value);
13518 if (res < 0)
13519 goto err;
13520 }
13521 /* create entries for deleting chars in z */
13522 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013523 z_kind = PyUnicode_KIND(z);
13524 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013525 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013526 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013527 if (!key)
13528 goto err;
13529 res = PyDict_SetItem(new, key, Py_None);
13530 Py_DECREF(key);
13531 if (res < 0)
13532 goto err;
13533 }
13534 }
13535 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013536 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013537 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013538
Georg Brandlceee0772007-11-27 23:48:05 +000013539 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013540 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013541 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13542 "to maketrans it must be a dict");
13543 goto err;
13544 }
13545 /* copy entries into the new dict, converting string keys to int keys */
13546 while (PyDict_Next(x, &i, &key, &value)) {
13547 if (PyUnicode_Check(key)) {
13548 /* convert string keys to integer keys */
13549 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013550 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013551 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13552 "table must be of length 1");
13553 goto err;
13554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013555 kind = PyUnicode_KIND(key);
13556 data = PyUnicode_DATA(key);
13557 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013558 if (!newkey)
13559 goto err;
13560 res = PyDict_SetItem(new, newkey, value);
13561 Py_DECREF(newkey);
13562 if (res < 0)
13563 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013564 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013565 /* just keep integer keys */
13566 if (PyDict_SetItem(new, key, value) < 0)
13567 goto err;
13568 } else {
13569 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13570 "be strings or integers");
13571 goto err;
13572 }
13573 }
13574 }
13575 return new;
13576 err:
13577 Py_DECREF(new);
13578 return NULL;
13579}
13580
INADA Naoki3ae20562017-01-16 20:41:20 +090013581/*[clinic input]
13582str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013583
INADA Naoki3ae20562017-01-16 20:41:20 +090013584 table: object
13585 Translation table, which must be a mapping of Unicode ordinals to
13586 Unicode ordinals, strings, or None.
13587 /
13588
13589Replace each character in the string using the given translation table.
13590
13591The table must implement lookup/indexing via __getitem__, for instance a
13592dictionary or list. If this operation raises LookupError, the character is
13593left untouched. Characters mapped to None are deleted.
13594[clinic start generated code]*/
13595
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
{
    /* Thin wrapper around _PyUnicode_TranslateCharmap(); the literal
       "ignore" is always passed as its third argument.
       NOTE(review): presumably that argument is the error-handling mode --
       confirm against the helper's definition. */
    return _PyUnicode_TranslateCharmap(self, table, "ignore");
}
13602
INADA Naoki3ae20562017-01-16 20:41:20 +090013603/*[clinic input]
13604str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013605
INADA Naoki3ae20562017-01-16 20:41:20 +090013606Return a copy of the string converted to uppercase.
13607[clinic start generated code]*/
13608
13609static PyObject *
13610unicode_upper_impl(PyObject *self)
13611/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013612{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013613 if (PyUnicode_READY(self) == -1)
13614 return NULL;
13615 if (PyUnicode_IS_ASCII(self))
13616 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013617 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013618}
13619
INADA Naoki3ae20562017-01-16 20:41:20 +090013620/*[clinic input]
13621str.zfill as unicode_zfill
13622
13623 width: Py_ssize_t
13624 /
13625
13626Pad a numeric string with zeros on the left, to fill a field of the given width.
13627
13628The string is never truncated.
13629[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013630
13631static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013632unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013633/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013634{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013635 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013636 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013637 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013638 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013639 Py_UCS4 chr;
13640
Benjamin Petersonbac79492012-01-14 13:34:47 -050013641 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013642 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013643
Victor Stinnerc4b49542011-12-11 22:44:26 +010013644 if (PyUnicode_GET_LENGTH(self) >= width)
13645 return unicode_result_unchanged(self);
13646
13647 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013648
13649 u = pad(self, fill, 0, '0');
13650
Walter Dörwald068325e2002-04-15 13:36:47 +000013651 if (u == NULL)
13652 return NULL;
13653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013654 kind = PyUnicode_KIND(u);
13655 data = PyUnicode_DATA(u);
13656 chr = PyUnicode_READ(kind, data, fill);
13657
13658 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013659 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013660 PyUnicode_WRITE(kind, data, 0, chr);
13661 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013662 }
13663
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013664 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013665 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013666}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013667
#if 0
/* Compiled out by the surrounding #if 0.  If enabled, this would simply
   forward self to PyUnicode_TransformDecimalAndSpaceToASCII() and return
   its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13675
/* Docstring text for S.startswith().  The continuation lines are part of
   the string literal (backslash-newline), so they must stay at column 0:
   any indentation added there would become part of the text. */
PyDoc_STRVAR(startswith__doc__,
    "S.startswith(prefix[, start[, end]]) -> bool\n\
\n\
Return True if S starts with the specified prefix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013683
13684static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013685unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013686 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013687{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013688 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013689 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013690 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013691 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013692 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013693
Jesus Ceaac451502011-04-20 17:09:23 +020013694 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013695 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013696 if (PyTuple_Check(subobj)) {
13697 Py_ssize_t i;
13698 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013699 substring = PyTuple_GET_ITEM(subobj, i);
13700 if (!PyUnicode_Check(substring)) {
13701 PyErr_Format(PyExc_TypeError,
13702 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013703 "not %.100s",
13704 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013705 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013706 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013707 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013708 if (result == -1)
13709 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013710 if (result) {
13711 Py_RETURN_TRUE;
13712 }
13713 }
13714 /* nothing matched */
13715 Py_RETURN_FALSE;
13716 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013717 if (!PyUnicode_Check(subobj)) {
13718 PyErr_Format(PyExc_TypeError,
13719 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013720 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013721 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013722 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013723 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013724 if (result == -1)
13725 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013726 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013727}
13728
13729
/* Docstring text for S.endswith().  The continuation lines are part of the
   string literal (backslash-newline), so they must stay at column 0: any
   indentation added there would become part of the text. */
PyDoc_STRVAR(endswith__doc__,
    "S.endswith(suffix[, start[, end]]) -> bool\n\
\n\
Return True if S ends with the specified suffix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013737
13738static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013739unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013740 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013741{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013742 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013743 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013744 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013745 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013746 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013747
Jesus Ceaac451502011-04-20 17:09:23 +020013748 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013749 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013750 if (PyTuple_Check(subobj)) {
13751 Py_ssize_t i;
13752 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013753 substring = PyTuple_GET_ITEM(subobj, i);
13754 if (!PyUnicode_Check(substring)) {
13755 PyErr_Format(PyExc_TypeError,
13756 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013757 "not %.100s",
13758 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013759 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013760 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013761 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013762 if (result == -1)
13763 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013764 if (result) {
13765 Py_RETURN_TRUE;
13766 }
13767 }
13768 Py_RETURN_FALSE;
13769 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013770 if (!PyUnicode_Check(subobj)) {
13771 PyErr_Format(PyExc_TypeError,
13772 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013773 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013774 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013775 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013776 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013777 if (result == -1)
13778 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013779 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013780}
13781
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013782static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013783_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013784{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013785 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13786 writer->data = PyUnicode_DATA(writer->buffer);
13787
13788 if (!writer->readonly) {
13789 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013790 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013791 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013792 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013793 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13794 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13795 writer->kind = PyUnicode_WCHAR_KIND;
13796 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13797
Victor Stinner8f674cc2013-04-17 23:02:17 +020013798 /* Copy-on-write mode: set buffer size to 0 so
13799 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13800 * next write. */
13801 writer->size = 0;
13802 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013803}
13804
Victor Stinnerd3f08822012-05-29 12:57:52 +020013805void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013806_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013807{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013808 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013809
13810 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013811 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013812
13813 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13814 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13815 writer->kind = PyUnicode_WCHAR_KIND;
13816 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013817}
13818
// Initialize _PyUnicodeWriter with initial buffer.
//
// The writer is zeroed, `buffer` is stored in it (no Py_INCREF is performed
// here), the cached fields are refreshed from the buffer through
// _PyUnicodeWriter_Update(), and min_length is set to the resulting size.
static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
{
    memset(writer, 0, sizeof(*writer));
    writer->buffer = buffer;
    // must run after buffer is set: Update() reads writer->buffer
    _PyUnicodeWriter_Update(writer);
    // must run after Update(): it is Update() that fills in writer->size
    writer->min_length = writer->size;
}
13828
/* Make room in the writer for `length` more characters whose code points do
   not exceed `maxchar`.

   Three situations are handled:
     - no buffer yet: a new string of the needed size is allocated;
     - the buffer is too short: it is either replaced by a new, possibly
       wider string (when maxchar grows or the buffer is read-only) or
       resized in place with resize_compact();
     - the buffer is long enough but too narrow: it is replaced by a wider
       string of the same size.
   In the replacement cases the first writer->pos characters are copied over.

   Returns 0 on success.  Returns -1 when pos + length would overflow
   Py_ssize_t (after PyErr_NoMemory()) or when PyUnicode_New() /
   resize_compact() return NULL. */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(maxchar <= MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* guard the addition below against Py_ssize_t overflow */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* never allocate narrower than the writer's configured minimum */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            /* the old buffer is released here; writer->buffer is pointed at
               the replacement just after this if/else */
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            /* on failure writer->buffer is left untouched and -1 returned */
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* enough room, but the buffer cannot hold maxchar: widen only */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    /* re-read maxchar/data/kind/size from the (possibly new) buffer */
    _PyUnicodeWriter_Update(writer);
    return 0;

#undef OVERALLOCATE_FACTOR
}
13905
Victor Stinnerca9381e2015-09-22 00:58:32 +020013906int
13907_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13908 enum PyUnicode_Kind kind)
13909{
13910 Py_UCS4 maxchar;
13911
13912 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13913 assert(writer->kind < kind);
13914
13915 switch (kind)
13916 {
13917 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13918 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13919 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13920 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013921 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013922 }
13923
13924 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13925}
13926
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013927static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013928_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013929{
Victor Stinner2740e462016-09-06 16:58:36 -070013930 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013931 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13932 return -1;
13933 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13934 writer->pos++;
13935 return 0;
13936}
13937
/* Out-of-line entry point with external linkage: forwards to the static
   inline _PyUnicodeWriter_WriteCharInline() and returns its result. */
int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
}
13943
13944int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013945_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13946{
13947 Py_UCS4 maxchar;
13948 Py_ssize_t len;
13949
13950 if (PyUnicode_READY(str) == -1)
13951 return -1;
13952 len = PyUnicode_GET_LENGTH(str);
13953 if (len == 0)
13954 return 0;
13955 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13956 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013957 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013958 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013959 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013960 Py_INCREF(str);
13961 writer->buffer = str;
13962 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013963 writer->pos += len;
13964 return 0;
13965 }
13966 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13967 return -1;
13968 }
13969 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13970 str, 0, len);
13971 writer->pos += len;
13972 return 0;
13973}
13974
Victor Stinnere215d962012-10-06 23:03:36 +020013975int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013976_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13977 Py_ssize_t start, Py_ssize_t end)
13978{
13979 Py_UCS4 maxchar;
13980 Py_ssize_t len;
13981
13982 if (PyUnicode_READY(str) == -1)
13983 return -1;
13984
13985 assert(0 <= start);
13986 assert(end <= PyUnicode_GET_LENGTH(str));
13987 assert(start <= end);
13988
13989 if (end == 0)
13990 return 0;
13991
13992 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13993 return _PyUnicodeWriter_WriteStr(writer, str);
13994
13995 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13996 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13997 else
13998 maxchar = writer->maxchar;
13999 len = end - start;
14000
14001 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14002 return -1;
14003
14004 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14005 str, start, len);
14006 writer->pos += len;
14007 return 0;
14008}
14009
14010int
Victor Stinner4a587072013-11-19 12:54:53 +010014011_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14012 const char *ascii, Py_ssize_t len)
14013{
14014 if (len == -1)
14015 len = strlen(ascii);
14016
Andy Lestere6be9b52020-02-11 20:28:35 -060014017 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014018
14019 if (writer->buffer == NULL && !writer->overallocate) {
14020 PyObject *str;
14021
14022 str = _PyUnicode_FromASCII(ascii, len);
14023 if (str == NULL)
14024 return -1;
14025
14026 writer->readonly = 1;
14027 writer->buffer = str;
14028 _PyUnicodeWriter_Update(writer);
14029 writer->pos += len;
14030 return 0;
14031 }
14032
14033 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14034 return -1;
14035
14036 switch (writer->kind)
14037 {
14038 case PyUnicode_1BYTE_KIND:
14039 {
14040 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14041 Py_UCS1 *data = writer->data;
14042
Christian Heimesf051e432016-09-13 20:22:02 +020014043 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014044 break;
14045 }
14046 case PyUnicode_2BYTE_KIND:
14047 {
14048 _PyUnicode_CONVERT_BYTES(
14049 Py_UCS1, Py_UCS2,
14050 ascii, ascii + len,
14051 (Py_UCS2 *)writer->data + writer->pos);
14052 break;
14053 }
14054 case PyUnicode_4BYTE_KIND:
14055 {
14056 _PyUnicode_CONVERT_BYTES(
14057 Py_UCS1, Py_UCS4,
14058 ascii, ascii + len,
14059 (Py_UCS4 *)writer->data + writer->pos);
14060 break;
14061 }
14062 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014063 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014064 }
14065
14066 writer->pos += len;
14067 return 0;
14068}
14069
14070int
14071_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14072 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014073{
14074 Py_UCS4 maxchar;
14075
Andy Lestere6be9b52020-02-11 20:28:35 -060014076 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014077 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14078 return -1;
14079 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14080 writer->pos += len;
14081 return 0;
14082}
14083
Victor Stinnerd3f08822012-05-29 12:57:52 +020014084PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014085_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014086{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014087 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014088
Victor Stinnerd3f08822012-05-29 12:57:52 +020014089 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014090 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014091 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014092 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014093
14094 str = writer->buffer;
14095 writer->buffer = NULL;
14096
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014097 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014098 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14099 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014100 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014101
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014102 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14103 PyObject *str2;
14104 str2 = resize_compact(str, writer->pos);
14105 if (str2 == NULL) {
14106 Py_DECREF(str);
14107 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014108 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014109 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014110 }
14111
Victor Stinner15a0bd32013-07-08 22:29:55 +020014112 assert(_PyUnicode_CheckConsistency(str, 1));
14113 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014114}
14115
/* Release the writer's reference to its buffer, if any, and reset the field
   to NULL (Py_CLEAR).  No other writer field is touched. */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}
14121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014122#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014123
/* Docstring text for S.format().  The continuation lines are part of the
   string literal (backslash-newline), so they must stay at column 0. */
PyDoc_STRVAR(format__doc__,
    "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014129
Eric Smith27bbca62010-11-04 17:06:58 +000014130PyDoc_STRVAR(format_map__doc__,
14131 "S.format_map(mapping) -> str\n\
14132\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014133Return a formatted version of S, using substitutions from mapping.\n\
14134The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014135
INADA Naoki3ae20562017-01-16 20:41:20 +090014136/*[clinic input]
14137str.__format__ as unicode___format__
14138
14139 format_spec: unicode
14140 /
14141
14142Return a formatted version of the string as described by format_spec.
14143[clinic start generated code]*/
14144
Eric Smith4a7d76d2008-05-30 18:10:19 +000014145static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014146unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014147/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014148{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014149 _PyUnicodeWriter writer;
14150 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014151
Victor Stinnerd3f08822012-05-29 12:57:52 +020014152 if (PyUnicode_READY(self) == -1)
14153 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014154 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014155 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14156 self, format_spec, 0,
14157 PyUnicode_GET_LENGTH(format_spec));
14158 if (ret == -1) {
14159 _PyUnicodeWriter_Dealloc(&writer);
14160 return NULL;
14161 }
14162 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014163}
14164
INADA Naoki3ae20562017-01-16 20:41:20 +090014165/*[clinic input]
14166str.__sizeof__ as unicode_sizeof
14167
14168Return the size of the string in memory, in bytes.
14169[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014170
14171static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014172unicode_sizeof_impl(PyObject *self)
14173/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014174{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014175 Py_ssize_t size;
14176
14177 /* If it's a compact object, account for base structure +
14178 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014179 if (PyUnicode_IS_COMPACT_ASCII(self))
14180 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14181 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014182 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014183 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014184 else {
14185 /* If it is a two-block object, account for base object, and
14186 for character block if present. */
14187 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014188 if (_PyUnicode_DATA_ANY(self))
14189 size += (PyUnicode_GET_LENGTH(self) + 1) *
14190 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014191 }
14192 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014193 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014194 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14195 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14196 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14197 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014198
14199 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014200}
14201
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014202static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014203unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014204{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014205 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014206 if (!copy)
14207 return NULL;
14208 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014209}
14210
Guido van Rossumd57fd912000-03-10 22:53:23 +000014211static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014212 UNICODE_ENCODE_METHODDEF
14213 UNICODE_REPLACE_METHODDEF
14214 UNICODE_SPLIT_METHODDEF
14215 UNICODE_RSPLIT_METHODDEF
14216 UNICODE_JOIN_METHODDEF
14217 UNICODE_CAPITALIZE_METHODDEF
14218 UNICODE_CASEFOLD_METHODDEF
14219 UNICODE_TITLE_METHODDEF
14220 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014221 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014222 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014223 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014224 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014225 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014226 UNICODE_LJUST_METHODDEF
14227 UNICODE_LOWER_METHODDEF
14228 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014229 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14230 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014231 UNICODE_RJUST_METHODDEF
14232 UNICODE_RSTRIP_METHODDEF
14233 UNICODE_RPARTITION_METHODDEF
14234 UNICODE_SPLITLINES_METHODDEF
14235 UNICODE_STRIP_METHODDEF
14236 UNICODE_SWAPCASE_METHODDEF
14237 UNICODE_TRANSLATE_METHODDEF
14238 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014239 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14240 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014241 UNICODE_REMOVEPREFIX_METHODDEF
14242 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014243 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014244 UNICODE_ISLOWER_METHODDEF
14245 UNICODE_ISUPPER_METHODDEF
14246 UNICODE_ISTITLE_METHODDEF
14247 UNICODE_ISSPACE_METHODDEF
14248 UNICODE_ISDECIMAL_METHODDEF
14249 UNICODE_ISDIGIT_METHODDEF
14250 UNICODE_ISNUMERIC_METHODDEF
14251 UNICODE_ISALPHA_METHODDEF
14252 UNICODE_ISALNUM_METHODDEF
14253 UNICODE_ISIDENTIFIER_METHODDEF
14254 UNICODE_ISPRINTABLE_METHODDEF
14255 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014256 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014257 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014258 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014259 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014260 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014261#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014262 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014263 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014264#endif
14265
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014266 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014267 {NULL, NULL}
14268};
14269
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014270static PyObject *
14271unicode_mod(PyObject *v, PyObject *w)
14272{
Brian Curtindfc80e32011-08-10 20:28:54 -050014273 if (!PyUnicode_Check(v))
14274 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014275 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014276}
14277
14278static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014279 0, /*nb_add*/
14280 0, /*nb_subtract*/
14281 0, /*nb_multiply*/
14282 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014283};
14284
Guido van Rossumd57fd912000-03-10 22:53:23 +000014285static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014286 (lenfunc) unicode_length, /* sq_length */
14287 PyUnicode_Concat, /* sq_concat */
14288 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14289 (ssizeargfunc) unicode_getitem, /* sq_item */
14290 0, /* sq_slice */
14291 0, /* sq_ass_item */
14292 0, /* sq_ass_slice */
14293 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014294};
14295
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014296static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014297unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014299 if (PyUnicode_READY(self) == -1)
14300 return NULL;
14301
Victor Stinnera15e2602020-04-08 02:01:56 +020014302 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014303 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014304 if (i == -1 && PyErr_Occurred())
14305 return NULL;
14306 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014307 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014308 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014309 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014310 Py_ssize_t start, stop, step, slicelength, i;
14311 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014312 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014313 const void *src_data;
14314 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014315 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014316 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014317
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014318 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014319 return NULL;
14320 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014321 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14322 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014323
14324 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014325 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014326 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014327 slicelength == PyUnicode_GET_LENGTH(self)) {
14328 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014329 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014330 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014331 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014332 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014333 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014334 src_kind = PyUnicode_KIND(self);
14335 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014336 if (!PyUnicode_IS_ASCII(self)) {
14337 kind_limit = kind_maxchar_limit(src_kind);
14338 max_char = 0;
14339 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14340 ch = PyUnicode_READ(src_kind, src_data, cur);
14341 if (ch > max_char) {
14342 max_char = ch;
14343 if (max_char >= kind_limit)
14344 break;
14345 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014346 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014347 }
Victor Stinner55c99112011-10-13 01:17:06 +020014348 else
14349 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014350 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014351 if (result == NULL)
14352 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014353 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014354 dest_data = PyUnicode_DATA(result);
14355
14356 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014357 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14358 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014359 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014360 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014361 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014362 } else {
14363 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14364 return NULL;
14365 }
14366}
14367
14368static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014369 (lenfunc)unicode_length, /* mp_length */
14370 (binaryfunc)unicode_subscript, /* mp_subscript */
14371 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014372};
14373
Guido van Rossumd57fd912000-03-10 22:53:23 +000014374
Guido van Rossumd57fd912000-03-10 22:53:23 +000014375/* Helpers for PyUnicode_Format() */
14376
Victor Stinnera47082312012-10-04 02:19:54 +020014377struct unicode_formatter_t {
14378 PyObject *args;
14379 int args_owned;
14380 Py_ssize_t arglen, argidx;
14381 PyObject *dict;
14382
14383 enum PyUnicode_Kind fmtkind;
14384 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014385 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014386 PyObject *fmtstr;
14387
14388 _PyUnicodeWriter writer;
14389};
14390
14391struct unicode_format_arg_t {
14392 Py_UCS4 ch;
14393 int flags;
14394 Py_ssize_t width;
14395 int prec;
14396 int sign;
14397};
14398
Guido van Rossumd57fd912000-03-10 22:53:23 +000014399static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014400unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014401{
Victor Stinnera47082312012-10-04 02:19:54 +020014402 Py_ssize_t argidx = ctx->argidx;
14403
14404 if (argidx < ctx->arglen) {
14405 ctx->argidx++;
14406 if (ctx->arglen < 0)
14407 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014408 else
Victor Stinnera47082312012-10-04 02:19:54 +020014409 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014410 }
14411 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014412 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014413 return NULL;
14414}
14415
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014416/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014417
Victor Stinnera47082312012-10-04 02:19:54 +020014418/* Format a float into the writer if the writer is not NULL, or into *p_output
14419 otherwise.
14420
14421 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014422static int
Victor Stinnera47082312012-10-04 02:19:54 +020014423formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14424 PyObject **p_output,
14425 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014426{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014427 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014428 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014429 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014430 int prec;
14431 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014432
Guido van Rossumd57fd912000-03-10 22:53:23 +000014433 x = PyFloat_AsDouble(v);
14434 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014435 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014436
Victor Stinnera47082312012-10-04 02:19:54 +020014437 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014438 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014439 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014440
Victor Stinnera47082312012-10-04 02:19:54 +020014441 if (arg->flags & F_ALT)
14442 dtoa_flags = Py_DTSF_ALT;
14443 else
14444 dtoa_flags = 0;
14445 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014446 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014447 return -1;
14448 len = strlen(p);
14449 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014450 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014451 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014452 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014453 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014454 }
14455 else
14456 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014457 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014458 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014459}
14460
Victor Stinnerd0880d52012-04-27 23:40:13 +020014461/* formatlong() emulates the format codes d, u, o, x and X, and
14462 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14463 * Python's regular ints.
14464 * Return value: a new PyUnicodeObject*, or NULL if error.
14465 * The output string is of the form
14466 * "-"? ("0x" | "0X")? digit+
14467 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14468 * set in flags. The case of hex digits will be correct,
14469 * There will be at least prec digits, zero-filled on the left if
14470 * necessary to get that many.
14471 * val object to be converted
14472 * flags bitmask of format flags; only F_ALT is looked at
14473 * prec minimum number of digits; 0-fill on left if needed
14474 * type a character in [duoxX]; u acts the same as d
14475 *
14476 * CAUTION: o, x and X conversions on regular ints can never
14477 * produce a '-' sign, but can for Python's unbounded ints.
14478 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014479PyObject *
14480_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014481{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014482 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014483 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014484 Py_ssize_t i;
14485 int sign; /* 1 if '-', else 0 */
14486 int len; /* number of characters */
14487 Py_ssize_t llen;
14488 int numdigits; /* len == numnondigits + numdigits */
14489 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014490
Victor Stinnerd0880d52012-04-27 23:40:13 +020014491 /* Avoid exceeding SSIZE_T_MAX */
14492 if (prec > INT_MAX-3) {
14493 PyErr_SetString(PyExc_OverflowError,
14494 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014495 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014496 }
14497
14498 assert(PyLong_Check(val));
14499
14500 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014501 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014502 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014503 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014504 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014505 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014506 /* int and int subclasses should print numerically when a numeric */
14507 /* format code is used (see issue18780) */
14508 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014509 break;
14510 case 'o':
14511 numnondigits = 2;
14512 result = PyNumber_ToBase(val, 8);
14513 break;
14514 case 'x':
14515 case 'X':
14516 numnondigits = 2;
14517 result = PyNumber_ToBase(val, 16);
14518 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014519 }
14520 if (!result)
14521 return NULL;
14522
14523 assert(unicode_modifiable(result));
14524 assert(PyUnicode_IS_READY(result));
14525 assert(PyUnicode_IS_ASCII(result));
14526
14527 /* To modify the string in-place, there can only be one reference. */
14528 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014529 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014530 PyErr_BadInternalCall();
14531 return NULL;
14532 }
14533 buf = PyUnicode_DATA(result);
14534 llen = PyUnicode_GET_LENGTH(result);
14535 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014536 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014537 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014538 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014539 return NULL;
14540 }
14541 len = (int)llen;
14542 sign = buf[0] == '-';
14543 numnondigits += sign;
14544 numdigits = len - numnondigits;
14545 assert(numdigits > 0);
14546
14547 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014548 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014549 (type == 'o' || type == 'x' || type == 'X'))) {
14550 assert(buf[sign] == '0');
14551 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14552 buf[sign+1] == 'o');
14553 numnondigits -= 2;
14554 buf += 2;
14555 len -= 2;
14556 if (sign)
14557 buf[0] = '-';
14558 assert(len == numnondigits + numdigits);
14559 assert(numdigits > 0);
14560 }
14561
14562 /* Fill with leading zeroes to meet minimum width. */
14563 if (prec > numdigits) {
14564 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14565 numnondigits + prec);
14566 char *b1;
14567 if (!r1) {
14568 Py_DECREF(result);
14569 return NULL;
14570 }
14571 b1 = PyBytes_AS_STRING(r1);
14572 for (i = 0; i < numnondigits; ++i)
14573 *b1++ = *buf++;
14574 for (i = 0; i < prec - numdigits; i++)
14575 *b1++ = '0';
14576 for (i = 0; i < numdigits; i++)
14577 *b1++ = *buf++;
14578 *b1 = '\0';
14579 Py_DECREF(result);
14580 result = r1;
14581 buf = PyBytes_AS_STRING(result);
14582 len = numnondigits + prec;
14583 }
14584
14585 /* Fix up case for hex conversions. */
14586 if (type == 'X') {
14587 /* Need to convert all lower case letters to upper case.
14588 and need to convert 0x to 0X (and -0x to -0X). */
14589 for (i = 0; i < len; i++)
14590 if (buf[i] >= 'a' && buf[i] <= 'x')
14591 buf[i] -= 'a'-'A';
14592 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014593 if (!PyUnicode_Check(result)
14594 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014595 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014596 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014597 Py_DECREF(result);
14598 result = unicode;
14599 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014600 else if (len != PyUnicode_GET_LENGTH(result)) {
14601 if (PyUnicode_Resize(&result, len) < 0)
14602 Py_CLEAR(result);
14603 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014604 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014605}
14606
Ethan Furmandf3ed242014-01-05 06:50:30 -080014607/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014608 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014609 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014610 * -1 and raise an exception on error */
14611static int
Victor Stinnera47082312012-10-04 02:19:54 +020014612mainformatlong(PyObject *v,
14613 struct unicode_format_arg_t *arg,
14614 PyObject **p_output,
14615 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014616{
14617 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014618 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014619
14620 if (!PyNumber_Check(v))
14621 goto wrongtype;
14622
Ethan Furman9ab74802014-03-21 06:38:46 -070014623 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014624 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014625 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014626 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014627 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014628 if (PyErr_ExceptionMatches(PyExc_TypeError))
14629 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014630 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014631 }
14632 }
14633 else {
14634 iobj = PyNumber_Long(v);
14635 if (iobj == NULL ) {
14636 if (PyErr_ExceptionMatches(PyExc_TypeError))
14637 goto wrongtype;
14638 return -1;
14639 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014640 }
14641 assert(PyLong_Check(iobj));
14642 }
14643 else {
14644 iobj = v;
14645 Py_INCREF(iobj);
14646 }
14647
14648 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014649 && arg->width == -1 && arg->prec == -1
14650 && !(arg->flags & (F_SIGN | F_BLANK))
14651 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014652 {
14653 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014654 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014655 int base;
14656
Victor Stinnera47082312012-10-04 02:19:54 +020014657 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014658 {
14659 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014660 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014661 case 'd':
14662 case 'i':
14663 case 'u':
14664 base = 10;
14665 break;
14666 case 'o':
14667 base = 8;
14668 break;
14669 case 'x':
14670 case 'X':
14671 base = 16;
14672 break;
14673 }
14674
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014675 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14676 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014677 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014678 }
14679 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014680 return 1;
14681 }
14682
Ethan Furmanb95b5612015-01-23 20:05:18 -080014683 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014684 Py_DECREF(iobj);
14685 if (res == NULL)
14686 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014687 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014688 return 0;
14689
14690wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014691 switch(type)
14692 {
14693 case 'o':
14694 case 'x':
14695 case 'X':
14696 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014697 "%%%c format: an integer is required, "
14698 "not %.200s",
14699 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014700 break;
14701 default:
14702 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014703 "%%%c format: a number is required, "
14704 "not %.200s",
14705 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014706 break;
14707 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014708 return -1;
14709}
14710
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014711static Py_UCS4
14712formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014713{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014714 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014715 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014716 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014717 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014718 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014719 goto onError;
14720 }
14721 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014722 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014723 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014724 /* make sure number is a type of integer */
14725 if (!PyLong_Check(v)) {
14726 iobj = PyNumber_Index(v);
14727 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014728 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014729 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014730 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014731 Py_DECREF(iobj);
14732 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014733 else {
14734 x = PyLong_AsLong(v);
14735 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014736 if (x == -1 && PyErr_Occurred())
14737 goto onError;
14738
Victor Stinner8faf8212011-12-08 22:14:11 +010014739 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014740 PyErr_SetString(PyExc_OverflowError,
14741 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014742 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014743 }
14744
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014745 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014746 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014747
Benjamin Peterson29060642009-01-31 22:14:21 +000014748 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014749 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014750 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014751 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014752}
14753
Victor Stinnera47082312012-10-04 02:19:54 +020014754/* Parse options of an argument: flags, width, precision.
14755 Handle also "%(name)" syntax.
14756
14757 Return 0 if the argument has been formatted into arg->str.
14758 Return 1 if the argument has been written into ctx->writer,
14759 Raise an exception and return -1 on error. */
14760static int
14761unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14762 struct unicode_format_arg_t *arg)
14763{
14764#define FORMAT_READ(ctx) \
14765 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14766
14767 PyObject *v;
14768
Victor Stinnera47082312012-10-04 02:19:54 +020014769 if (arg->ch == '(') {
14770 /* Get argument value from a dictionary. Example: "%(name)s". */
14771 Py_ssize_t keystart;
14772 Py_ssize_t keylen;
14773 PyObject *key;
14774 int pcount = 1;
14775
14776 if (ctx->dict == NULL) {
14777 PyErr_SetString(PyExc_TypeError,
14778 "format requires a mapping");
14779 return -1;
14780 }
14781 ++ctx->fmtpos;
14782 --ctx->fmtcnt;
14783 keystart = ctx->fmtpos;
14784 /* Skip over balanced parentheses */
14785 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14786 arg->ch = FORMAT_READ(ctx);
14787 if (arg->ch == ')')
14788 --pcount;
14789 else if (arg->ch == '(')
14790 ++pcount;
14791 ctx->fmtpos++;
14792 }
14793 keylen = ctx->fmtpos - keystart - 1;
14794 if (ctx->fmtcnt < 0 || pcount > 0) {
14795 PyErr_SetString(PyExc_ValueError,
14796 "incomplete format key");
14797 return -1;
14798 }
14799 key = PyUnicode_Substring(ctx->fmtstr,
14800 keystart, keystart + keylen);
14801 if (key == NULL)
14802 return -1;
14803 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014804 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014805 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014806 }
14807 ctx->args = PyObject_GetItem(ctx->dict, key);
14808 Py_DECREF(key);
14809 if (ctx->args == NULL)
14810 return -1;
14811 ctx->args_owned = 1;
14812 ctx->arglen = -1;
14813 ctx->argidx = -2;
14814 }
14815
14816 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014817 while (--ctx->fmtcnt >= 0) {
14818 arg->ch = FORMAT_READ(ctx);
14819 ctx->fmtpos++;
14820 switch (arg->ch) {
14821 case '-': arg->flags |= F_LJUST; continue;
14822 case '+': arg->flags |= F_SIGN; continue;
14823 case ' ': arg->flags |= F_BLANK; continue;
14824 case '#': arg->flags |= F_ALT; continue;
14825 case '0': arg->flags |= F_ZERO; continue;
14826 }
14827 break;
14828 }
14829
14830 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014831 if (arg->ch == '*') {
14832 v = unicode_format_getnextarg(ctx);
14833 if (v == NULL)
14834 return -1;
14835 if (!PyLong_Check(v)) {
14836 PyErr_SetString(PyExc_TypeError,
14837 "* wants int");
14838 return -1;
14839 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014840 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014841 if (arg->width == -1 && PyErr_Occurred())
14842 return -1;
14843 if (arg->width < 0) {
14844 arg->flags |= F_LJUST;
14845 arg->width = -arg->width;
14846 }
14847 if (--ctx->fmtcnt >= 0) {
14848 arg->ch = FORMAT_READ(ctx);
14849 ctx->fmtpos++;
14850 }
14851 }
14852 else if (arg->ch >= '0' && arg->ch <= '9') {
14853 arg->width = arg->ch - '0';
14854 while (--ctx->fmtcnt >= 0) {
14855 arg->ch = FORMAT_READ(ctx);
14856 ctx->fmtpos++;
14857 if (arg->ch < '0' || arg->ch > '9')
14858 break;
14859 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14860 mixing signed and unsigned comparison. Since arg->ch is between
14861 '0' and '9', casting to int is safe. */
14862 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14863 PyErr_SetString(PyExc_ValueError,
14864 "width too big");
14865 return -1;
14866 }
14867 arg->width = arg->width*10 + (arg->ch - '0');
14868 }
14869 }
14870
14871 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014872 if (arg->ch == '.') {
14873 arg->prec = 0;
14874 if (--ctx->fmtcnt >= 0) {
14875 arg->ch = FORMAT_READ(ctx);
14876 ctx->fmtpos++;
14877 }
14878 if (arg->ch == '*') {
14879 v = unicode_format_getnextarg(ctx);
14880 if (v == NULL)
14881 return -1;
14882 if (!PyLong_Check(v)) {
14883 PyErr_SetString(PyExc_TypeError,
14884 "* wants int");
14885 return -1;
14886 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014887 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014888 if (arg->prec == -1 && PyErr_Occurred())
14889 return -1;
14890 if (arg->prec < 0)
14891 arg->prec = 0;
14892 if (--ctx->fmtcnt >= 0) {
14893 arg->ch = FORMAT_READ(ctx);
14894 ctx->fmtpos++;
14895 }
14896 }
14897 else if (arg->ch >= '0' && arg->ch <= '9') {
14898 arg->prec = arg->ch - '0';
14899 while (--ctx->fmtcnt >= 0) {
14900 arg->ch = FORMAT_READ(ctx);
14901 ctx->fmtpos++;
14902 if (arg->ch < '0' || arg->ch > '9')
14903 break;
14904 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14905 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014906 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014907 return -1;
14908 }
14909 arg->prec = arg->prec*10 + (arg->ch - '0');
14910 }
14911 }
14912 }
14913
14914 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14915 if (ctx->fmtcnt >= 0) {
14916 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14917 if (--ctx->fmtcnt >= 0) {
14918 arg->ch = FORMAT_READ(ctx);
14919 ctx->fmtpos++;
14920 }
14921 }
14922 }
14923 if (ctx->fmtcnt < 0) {
14924 PyErr_SetString(PyExc_ValueError,
14925 "incomplete format");
14926 return -1;
14927 }
14928 return 0;
14929
14930#undef FORMAT_READ
14931}
14932
14933/* Format one argument. Supported conversion specifiers:
14934
14935 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014936 - "i", "d", "u": int or float
14937 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014938 - "e", "E", "f", "F", "g", "G": float
14939 - "c": int or str (1 character)
14940
Victor Stinner8dbd4212012-12-04 09:30:24 +010014941 When possible, the output is written directly into the Unicode writer
14942 (ctx->writer). A string is created when padding is required.
14943
Victor Stinnera47082312012-10-04 02:19:54 +020014944 Return 0 if the argument has been formatted into *p_str,
14945 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014946 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014947static int
14948unicode_format_arg_format(struct unicode_formatter_t *ctx,
14949 struct unicode_format_arg_t *arg,
14950 PyObject **p_str)
14951{
14952 PyObject *v;
14953 _PyUnicodeWriter *writer = &ctx->writer;
14954
14955 if (ctx->fmtcnt == 0)
14956 ctx->writer.overallocate = 0;
14957
Victor Stinnera47082312012-10-04 02:19:54 +020014958 v = unicode_format_getnextarg(ctx);
14959 if (v == NULL)
14960 return -1;
14961
Victor Stinnera47082312012-10-04 02:19:54 +020014962
14963 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014964 case 's':
14965 case 'r':
14966 case 'a':
14967 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14968 /* Fast path */
14969 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14970 return -1;
14971 return 1;
14972 }
14973
14974 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14975 *p_str = v;
14976 Py_INCREF(*p_str);
14977 }
14978 else {
14979 if (arg->ch == 's')
14980 *p_str = PyObject_Str(v);
14981 else if (arg->ch == 'r')
14982 *p_str = PyObject_Repr(v);
14983 else
14984 *p_str = PyObject_ASCII(v);
14985 }
14986 break;
14987
14988 case 'i':
14989 case 'd':
14990 case 'u':
14991 case 'o':
14992 case 'x':
14993 case 'X':
14994 {
14995 int ret = mainformatlong(v, arg, p_str, writer);
14996 if (ret != 0)
14997 return ret;
14998 arg->sign = 1;
14999 break;
15000 }
15001
15002 case 'e':
15003 case 'E':
15004 case 'f':
15005 case 'F':
15006 case 'g':
15007 case 'G':
15008 if (arg->width == -1 && arg->prec == -1
15009 && !(arg->flags & (F_SIGN | F_BLANK)))
15010 {
15011 /* Fast path */
15012 if (formatfloat(v, arg, NULL, writer) == -1)
15013 return -1;
15014 return 1;
15015 }
15016
15017 arg->sign = 1;
15018 if (formatfloat(v, arg, p_str, NULL) == -1)
15019 return -1;
15020 break;
15021
15022 case 'c':
15023 {
15024 Py_UCS4 ch = formatchar(v);
15025 if (ch == (Py_UCS4) -1)
15026 return -1;
15027 if (arg->width == -1 && arg->prec == -1) {
15028 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015029 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015030 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015031 return 1;
15032 }
15033 *p_str = PyUnicode_FromOrdinal(ch);
15034 break;
15035 }
15036
15037 default:
15038 PyErr_Format(PyExc_ValueError,
15039 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015040 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015041 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15042 (int)arg->ch,
15043 ctx->fmtpos - 1);
15044 return -1;
15045 }
15046 if (*p_str == NULL)
15047 return -1;
15048 assert (PyUnicode_Check(*p_str));
15049 return 0;
15050}
15051
15052static int
15053unicode_format_arg_output(struct unicode_formatter_t *ctx,
15054 struct unicode_format_arg_t *arg,
15055 PyObject *str)
15056{
15057 Py_ssize_t len;
15058 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015059 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015060 Py_ssize_t pindex;
15061 Py_UCS4 signchar;
15062 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015063 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015064 Py_ssize_t sublen;
15065 _PyUnicodeWriter *writer = &ctx->writer;
15066 Py_UCS4 fill;
15067
15068 fill = ' ';
15069 if (arg->sign && arg->flags & F_ZERO)
15070 fill = '0';
15071
15072 if (PyUnicode_READY(str) == -1)
15073 return -1;
15074
15075 len = PyUnicode_GET_LENGTH(str);
15076 if ((arg->width == -1 || arg->width <= len)
15077 && (arg->prec == -1 || arg->prec >= len)
15078 && !(arg->flags & (F_SIGN | F_BLANK)))
15079 {
15080 /* Fast path */
15081 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15082 return -1;
15083 return 0;
15084 }
15085
15086 /* Truncate the string for "s", "r" and "a" formats
15087 if the precision is set */
15088 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15089 if (arg->prec >= 0 && len > arg->prec)
15090 len = arg->prec;
15091 }
15092
15093 /* Adjust sign and width */
15094 kind = PyUnicode_KIND(str);
15095 pbuf = PyUnicode_DATA(str);
15096 pindex = 0;
15097 signchar = '\0';
15098 if (arg->sign) {
15099 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15100 if (ch == '-' || ch == '+') {
15101 signchar = ch;
15102 len--;
15103 pindex++;
15104 }
15105 else if (arg->flags & F_SIGN)
15106 signchar = '+';
15107 else if (arg->flags & F_BLANK)
15108 signchar = ' ';
15109 else
15110 arg->sign = 0;
15111 }
15112 if (arg->width < len)
15113 arg->width = len;
15114
15115 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015116 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015117 if (!(arg->flags & F_LJUST)) {
15118 if (arg->sign) {
15119 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015120 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015121 }
15122 else {
15123 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015124 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015125 }
15126 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015127 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15128 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015129 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015130 }
15131
Victor Stinnera47082312012-10-04 02:19:54 +020015132 buflen = arg->width;
15133 if (arg->sign && len == arg->width)
15134 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015135 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015136 return -1;
15137
15138 /* Write the sign if needed */
15139 if (arg->sign) {
15140 if (fill != ' ') {
15141 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15142 writer->pos += 1;
15143 }
15144 if (arg->width > len)
15145 arg->width--;
15146 }
15147
15148 /* Write the numeric prefix for "x", "X" and "o" formats
15149 if the alternate form is used.
15150 For example, write "0x" for the "%#x" format. */
15151 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15152 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15153 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15154 if (fill != ' ') {
15155 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15156 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15157 writer->pos += 2;
15158 pindex += 2;
15159 }
15160 arg->width -= 2;
15161 if (arg->width < 0)
15162 arg->width = 0;
15163 len -= 2;
15164 }
15165
15166 /* Pad left with the fill character if needed */
15167 if (arg->width > len && !(arg->flags & F_LJUST)) {
15168 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015169 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015170 writer->pos += sublen;
15171 arg->width = len;
15172 }
15173
15174 /* If padding with spaces: write sign if needed and/or numeric prefix if
15175 the alternate form is used */
15176 if (fill == ' ') {
15177 if (arg->sign) {
15178 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15179 writer->pos += 1;
15180 }
15181 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15182 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15183 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15184 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15185 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15186 writer->pos += 2;
15187 pindex += 2;
15188 }
15189 }
15190
15191 /* Write characters */
15192 if (len) {
15193 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15194 str, pindex, len);
15195 writer->pos += len;
15196 }
15197
15198 /* Pad right with the fill character if needed */
15199 if (arg->width > len) {
15200 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015201 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015202 writer->pos += sublen;
15203 }
15204 return 0;
15205}
15206
15207/* Helper of PyUnicode_Format(): format one arg.
15208 Return 0 on success, raise an exception and return -1 on error. */
15209static int
15210unicode_format_arg(struct unicode_formatter_t *ctx)
15211{
15212 struct unicode_format_arg_t arg;
15213 PyObject *str;
15214 int ret;
15215
Victor Stinner8dbd4212012-12-04 09:30:24 +010015216 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015217 if (arg.ch == '%') {
15218 ctx->fmtpos++;
15219 ctx->fmtcnt--;
15220 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15221 return -1;
15222 return 0;
15223 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015224 arg.flags = 0;
15225 arg.width = -1;
15226 arg.prec = -1;
15227 arg.sign = 0;
15228 str = NULL;
15229
Victor Stinnera47082312012-10-04 02:19:54 +020015230 ret = unicode_format_arg_parse(ctx, &arg);
15231 if (ret == -1)
15232 return -1;
15233
15234 ret = unicode_format_arg_format(ctx, &arg, &str);
15235 if (ret == -1)
15236 return -1;
15237
15238 if (ret != 1) {
15239 ret = unicode_format_arg_output(ctx, &arg, str);
15240 Py_DECREF(str);
15241 if (ret == -1)
15242 return -1;
15243 }
15244
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015245 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015246 PyErr_SetString(PyExc_TypeError,
15247 "not all arguments converted during string formatting");
15248 return -1;
15249 }
15250 return 0;
15251}
15252
Alexander Belopolsky40018472011-02-26 01:02:56 +000015253PyObject *
15254PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015255{
Victor Stinnera47082312012-10-04 02:19:54 +020015256 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015257
Guido van Rossumd57fd912000-03-10 22:53:23 +000015258 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015259 PyErr_BadInternalCall();
15260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015261 }
Victor Stinnera47082312012-10-04 02:19:54 +020015262
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015263 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015264 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015265
15266 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015267 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15268 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15269 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15270 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015271
Victor Stinner8f674cc2013-04-17 23:02:17 +020015272 _PyUnicodeWriter_Init(&ctx.writer);
15273 ctx.writer.min_length = ctx.fmtcnt + 100;
15274 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015275
Guido van Rossumd57fd912000-03-10 22:53:23 +000015276 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015277 ctx.arglen = PyTuple_Size(args);
15278 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015279 }
15280 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015281 ctx.arglen = -1;
15282 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015283 }
Victor Stinnera47082312012-10-04 02:19:54 +020015284 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015285 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015286 ctx.dict = args;
15287 else
15288 ctx.dict = NULL;
15289 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015290
Victor Stinnera47082312012-10-04 02:19:54 +020015291 while (--ctx.fmtcnt >= 0) {
15292 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015293 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015294
15295 nonfmtpos = ctx.fmtpos++;
15296 while (ctx.fmtcnt >= 0 &&
15297 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15298 ctx.fmtpos++;
15299 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015300 }
Victor Stinnera47082312012-10-04 02:19:54 +020015301 if (ctx.fmtcnt < 0) {
15302 ctx.fmtpos--;
15303 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015304 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015305
Victor Stinnercfc4c132013-04-03 01:48:39 +020015306 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15307 nonfmtpos, ctx.fmtpos) < 0)
15308 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015309 }
15310 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015311 ctx.fmtpos++;
15312 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015313 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015314 }
15315 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015316
Victor Stinnera47082312012-10-04 02:19:54 +020015317 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015318 PyErr_SetString(PyExc_TypeError,
15319 "not all arguments converted during string formatting");
15320 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015321 }
15322
Victor Stinnera47082312012-10-04 02:19:54 +020015323 if (ctx.args_owned) {
15324 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015325 }
Victor Stinnera47082312012-10-04 02:19:54 +020015326 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015327
Benjamin Peterson29060642009-01-31 22:14:21 +000015328 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015329 _PyUnicodeWriter_Dealloc(&ctx.writer);
15330 if (ctx.args_owned) {
15331 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015332 }
15333 return NULL;
15334}
15335
Jeremy Hylton938ace62002-07-17 16:30:39 +000015336static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015337unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15338
Tim Peters6d6c1a32001-08-02 04:15:00 +000015339static PyObject *
15340unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15341{
Benjamin Peterson29060642009-01-31 22:14:21 +000015342 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015343 static char *kwlist[] = {"object", "encoding", "errors", 0};
15344 char *encoding = NULL;
15345 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015346
Benjamin Peterson14339b62009-01-31 16:36:08 +000015347 if (type != &PyUnicode_Type)
15348 return unicode_subtype_new(type, args, kwds);
15349 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015350 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015351 return NULL;
15352 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015353 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015354 if (encoding == NULL && errors == NULL)
15355 return PyObject_Str(x);
15356 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015357 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015358}
15359
Guido van Rossume023fe02001-08-30 03:12:59 +000015360static PyObject *
15361unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15362{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015363 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015364 Py_ssize_t length, char_size;
15365 int share_wstr, share_utf8;
15366 unsigned int kind;
15367 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015368
Benjamin Peterson14339b62009-01-31 16:36:08 +000015369 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015370
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015371 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015372 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015373 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015374 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015375 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015376 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015377 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015378 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015379
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015380 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015381 if (self == NULL) {
15382 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015383 return NULL;
15384 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015385 kind = PyUnicode_KIND(unicode);
15386 length = PyUnicode_GET_LENGTH(unicode);
15387
15388 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015389#ifdef Py_DEBUG
15390 _PyUnicode_HASH(self) = -1;
15391#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015392 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015393#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015394 _PyUnicode_STATE(self).interned = 0;
15395 _PyUnicode_STATE(self).kind = kind;
15396 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015397 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015398 _PyUnicode_STATE(self).ready = 1;
15399 _PyUnicode_WSTR(self) = NULL;
15400 _PyUnicode_UTF8_LENGTH(self) = 0;
15401 _PyUnicode_UTF8(self) = NULL;
15402 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015403 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015404
15405 share_utf8 = 0;
15406 share_wstr = 0;
15407 if (kind == PyUnicode_1BYTE_KIND) {
15408 char_size = 1;
15409 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15410 share_utf8 = 1;
15411 }
15412 else if (kind == PyUnicode_2BYTE_KIND) {
15413 char_size = 2;
15414 if (sizeof(wchar_t) == 2)
15415 share_wstr = 1;
15416 }
15417 else {
15418 assert(kind == PyUnicode_4BYTE_KIND);
15419 char_size = 4;
15420 if (sizeof(wchar_t) == 4)
15421 share_wstr = 1;
15422 }
15423
15424 /* Ensure we won't overflow the length. */
15425 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15426 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015427 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015428 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015429 data = PyObject_MALLOC((length + 1) * char_size);
15430 if (data == NULL) {
15431 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015432 goto onError;
15433 }
15434
Victor Stinnerc3c74152011-10-02 20:39:55 +020015435 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015436 if (share_utf8) {
15437 _PyUnicode_UTF8_LENGTH(self) = length;
15438 _PyUnicode_UTF8(self) = data;
15439 }
15440 if (share_wstr) {
15441 _PyUnicode_WSTR_LENGTH(self) = length;
15442 _PyUnicode_WSTR(self) = (wchar_t *)data;
15443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015444
Christian Heimesf051e432016-09-13 20:22:02 +020015445 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015446 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015447 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015448#ifdef Py_DEBUG
15449 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15450#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015451 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015452 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015453
15454onError:
15455 Py_DECREF(unicode);
15456 Py_DECREF(self);
15457 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015458}
15459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015460PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015461"str(object='') -> str\n\
15462str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015463\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015464Create a new string object from the given object. If encoding or\n\
15465errors is specified, then the object must expose a data buffer\n\
15466that will be decoded using the given encoding and error handler.\n\
15467Otherwise, returns the result of object.__str__() (if defined)\n\
15468or repr(object).\n\
15469encoding defaults to sys.getdefaultencoding().\n\
15470errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015471
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015472static PyObject *unicode_iter(PyObject *seq);
15473
Guido van Rossumd57fd912000-03-10 22:53:23 +000015474PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015475 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015476 "str", /* tp_name */
15477 sizeof(PyUnicodeObject), /* tp_basicsize */
15478 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015479 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015480 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015481 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015482 0, /* tp_getattr */
15483 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015484 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015485 unicode_repr, /* tp_repr */
15486 &unicode_as_number, /* tp_as_number */
15487 &unicode_as_sequence, /* tp_as_sequence */
15488 &unicode_as_mapping, /* tp_as_mapping */
15489 (hashfunc) unicode_hash, /* tp_hash*/
15490 0, /* tp_call*/
15491 (reprfunc) unicode_str, /* tp_str */
15492 PyObject_GenericGetAttr, /* tp_getattro */
15493 0, /* tp_setattro */
15494 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015495 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015496 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15497 unicode_doc, /* tp_doc */
15498 0, /* tp_traverse */
15499 0, /* tp_clear */
15500 PyUnicode_RichCompare, /* tp_richcompare */
15501 0, /* tp_weaklistoffset */
15502 unicode_iter, /* tp_iter */
15503 0, /* tp_iternext */
15504 unicode_methods, /* tp_methods */
15505 0, /* tp_members */
15506 0, /* tp_getset */
15507 &PyBaseObject_Type, /* tp_base */
15508 0, /* tp_dict */
15509 0, /* tp_descr_get */
15510 0, /* tp_descr_set */
15511 0, /* tp_dictoffset */
15512 0, /* tp_init */
15513 0, /* tp_alloc */
15514 unicode_new, /* tp_new */
15515 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015516};
15517
15518/* Initialize the Unicode implementation */
15519
Victor Stinner331a6a52019-05-27 16:39:22 +020015520PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015521_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015522{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015523 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015524 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015525 0x000A, /* LINE FEED */
15526 0x000D, /* CARRIAGE RETURN */
15527 0x001C, /* FILE SEPARATOR */
15528 0x001D, /* GROUP SEPARATOR */
15529 0x001E, /* RECORD SEPARATOR */
15530 0x0085, /* NEXT LINE */
15531 0x2028, /* LINE SEPARATOR */
15532 0x2029, /* PARAGRAPH SEPARATOR */
15533 };
15534
Fred Drakee4315f52000-05-09 19:53:39 +000015535 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015536 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015537 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015538 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015539 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015540 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015541
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015542 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015543 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015544 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015545
15546 /* initialize the linebreak bloom filter */
15547 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015548 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015549 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015550
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015551 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015552 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015553 }
15554 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015555 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015556 }
15557 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015558 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015559 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015560 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015561}
15562
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015563
Walter Dörwald16807132007-05-25 13:52:07 +000015564void
15565PyUnicode_InternInPlace(PyObject **p)
15566{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015567 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015568#ifdef Py_DEBUG
15569 assert(s != NULL);
15570 assert(_PyUnicode_CHECK(s));
15571#else
Victor Stinner607b1022020-05-05 18:50:30 +020015572 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015573 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015574 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015575#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015576
Benjamin Peterson14339b62009-01-31 16:36:08 +000015577 /* If it's a subclass, we don't really know what putting
15578 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015579 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015580 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015581 }
15582
15583 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015584 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015585 }
15586
15587#ifdef INTERNED_STRINGS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015588 if (interned == NULL) {
15589 interned = PyDict_New();
15590 if (interned == NULL) {
15591 PyErr_Clear(); /* Don't leave an exception */
15592 return;
15593 }
15594 }
Victor Stinner607b1022020-05-05 18:50:30 +020015595
15596 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015597 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015598 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015599 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015600
Berker Peksagced8d4c2016-07-25 04:40:39 +030015601 if (t == NULL) {
15602 PyErr_Clear();
15603 return;
15604 }
Victor Stinner607b1022020-05-05 18:50:30 +020015605
Berker Peksagced8d4c2016-07-25 04:40:39 +030015606 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015607 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015608 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015609 return;
15610 }
Victor Stinner607b1022020-05-05 18:50:30 +020015611
Benjamin Peterson14339b62009-01-31 16:36:08 +000015612 /* The two references in interned are not counted by refcnt.
15613 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015614 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015615 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015616#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015617}
15618
15619void
15620PyUnicode_InternImmortal(PyObject **p)
15621{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015622 PyUnicode_InternInPlace(p);
15623 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015624 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015625 Py_INCREF(*p);
15626 }
Walter Dörwald16807132007-05-25 13:52:07 +000015627}
15628
15629PyObject *
15630PyUnicode_InternFromString(const char *cp)
15631{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015632 PyObject *s = PyUnicode_FromString(cp);
15633 if (s == NULL)
15634 return NULL;
15635 PyUnicode_InternInPlace(&s);
15636 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015637}
15638
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015639
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Dismantle the interned dict so that leak detectors see less noise.

   Since this function is intended to help a leak detector, interned strings
   are not forcibly deallocated; rather, each one gets its stolen references
   back (one for an immortal string, two for a mortal one) and is flagged
   SSTATE_NOT_INTERNED, and then the interned dict is cleared and released.

   With INTERNED_STATS defined, the number of strings and their cumulated
   lengths are reported on stderr. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        /* Don't leak the object when it exists but is not a list. */
        Py_XDECREF(keys);
        PyErr_Clear();
        return;
    }

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        /* Use Py_SET_REFCNT() rather than assigning through Py_REFCNT(),
           as PyUnicode_InternInPlace() does. */
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            /* Every key of the interned dict must be flagged as interned. */
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015700
15701
15702/********************* Unicode Iterator **************************/
15703
15704typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015705 PyObject_HEAD
15706 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015707 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015708} unicodeiterobject;
15709
15710static void
15711unicodeiter_dealloc(unicodeiterobject *it)
15712{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015713 _PyObject_GC_UNTRACK(it);
15714 Py_XDECREF(it->it_seq);
15715 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015716}
15717
15718static int
15719unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15720{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015721 Py_VISIT(it->it_seq);
15722 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015723}
15724
15725static PyObject *
15726unicodeiter_next(unicodeiterobject *it)
15727{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015728 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015729
Benjamin Peterson14339b62009-01-31 16:36:08 +000015730 assert(it != NULL);
15731 seq = it->it_seq;
15732 if (seq == NULL)
15733 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015734 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015736 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15737 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015738 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015739 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15740 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015741 if (item != NULL)
15742 ++it->it_index;
15743 return item;
15744 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015745
Benjamin Peterson14339b62009-01-31 16:36:08 +000015746 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015747 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015748 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015749}
15750
15751static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015752unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015753{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015754 Py_ssize_t len = 0;
15755 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015756 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015757 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015758}
15759
15760PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15761
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015762static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015763unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015764{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015765 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015766 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015767 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015768 it->it_seq, it->it_index);
15769 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015770 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015771 if (u == NULL)
15772 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015773 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015774 }
15775}
15776
15777PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15778
15779static PyObject *
15780unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15781{
15782 Py_ssize_t index = PyLong_AsSsize_t(state);
15783 if (index == -1 && PyErr_Occurred())
15784 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015785 if (it->it_seq != NULL) {
15786 if (index < 0)
15787 index = 0;
15788 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15789 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15790 it->it_index = index;
15791 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015792 Py_RETURN_NONE;
15793}
15794
15795PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15796
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015797static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015798 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015799 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015800 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15801 reduce_doc},
15802 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15803 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015804 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015805};
15806
15807PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015808 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15809 "str_iterator", /* tp_name */
15810 sizeof(unicodeiterobject), /* tp_basicsize */
15811 0, /* tp_itemsize */
15812 /* methods */
15813 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015814 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015815 0, /* tp_getattr */
15816 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015817 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015818 0, /* tp_repr */
15819 0, /* tp_as_number */
15820 0, /* tp_as_sequence */
15821 0, /* tp_as_mapping */
15822 0, /* tp_hash */
15823 0, /* tp_call */
15824 0, /* tp_str */
15825 PyObject_GenericGetAttr, /* tp_getattro */
15826 0, /* tp_setattro */
15827 0, /* tp_as_buffer */
15828 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15829 0, /* tp_doc */
15830 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15831 0, /* tp_clear */
15832 0, /* tp_richcompare */
15833 0, /* tp_weaklistoffset */
15834 PyObject_SelfIter, /* tp_iter */
15835 (iternextfunc)unicodeiter_next, /* tp_iternext */
15836 unicodeiter_methods, /* tp_methods */
15837 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015838};
15839
15840static PyObject *
15841unicode_iter(PyObject *seq)
15842{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015843 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015844
Benjamin Peterson14339b62009-01-31 16:36:08 +000015845 if (!PyUnicode_Check(seq)) {
15846 PyErr_BadInternalCall();
15847 return NULL;
15848 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015849 if (PyUnicode_READY(seq) == -1)
15850 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015851 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15852 if (it == NULL)
15853 return NULL;
15854 it->it_index = 0;
15855 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015856 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015857 _PyObject_GC_TRACK(it);
15858 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015859}
15860
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015861
15862size_t
15863Py_UNICODE_strlen(const Py_UNICODE *u)
15864{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015865 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015866}
15867
15868Py_UNICODE*
15869Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15870{
15871 Py_UNICODE *u = s1;
15872 while ((*u++ = *s2++));
15873 return s1;
15874}
15875
15876Py_UNICODE*
15877Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15878{
15879 Py_UNICODE *u = s1;
15880 while ((*u++ = *s2++))
15881 if (n-- == 0)
15882 break;
15883 return s1;
15884}
15885
15886Py_UNICODE*
15887Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15888{
15889 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015890 u1 += wcslen(u1);
15891 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015892 return s1;
15893}
15894
15895int
15896Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15897{
15898 while (*s1 && *s2 && *s1 == *s2)
15899 s1++, s2++;
15900 if (*s1 && *s2)
15901 return (*s1 < *s2) ? -1 : +1;
15902 if (*s1)
15903 return 1;
15904 if (*s2)
15905 return -1;
15906 return 0;
15907}
15908
15909int
15910Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15911{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015912 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015913 for (; n != 0; n--) {
15914 u1 = *s1;
15915 u2 = *s2;
15916 if (u1 != u2)
15917 return (u1 < u2) ? -1 : +1;
15918 if (u1 == '\0')
15919 return 0;
15920 s1++;
15921 s2++;
15922 }
15923 return 0;
15924}
15925
15926Py_UNICODE*
15927Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15928{
15929 const Py_UNICODE *p;
15930 for (p = s; *p; p++)
15931 if (*p == c)
15932 return (Py_UNICODE*)p;
15933 return NULL;
15934}
15935
15936Py_UNICODE*
15937Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15938{
15939 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015940 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015941 while (p != s) {
15942 p--;
15943 if (*p == c)
15944 return (Py_UNICODE*)p;
15945 }
15946 return NULL;
15947}
Victor Stinner331ea922010-08-10 16:37:20 +000015948
Victor Stinner71133ff2010-09-01 23:43:53 +000015949Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015950PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015951{
Victor Stinner577db2c2011-10-11 22:12:48 +020015952 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015953 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015955 if (!PyUnicode_Check(unicode)) {
15956 PyErr_BadArgument();
15957 return NULL;
15958 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015959 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015960 if (u == NULL)
15961 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015962 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015963 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015964 PyErr_NoMemory();
15965 return NULL;
15966 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015967 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015968 size *= sizeof(Py_UNICODE);
15969 copy = PyMem_Malloc(size);
15970 if (copy == NULL) {
15971 PyErr_NoMemory();
15972 return NULL;
15973 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015974 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015975 return copy;
15976}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015977
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015978
Victor Stinner709d23d2019-05-02 14:56:30 -040015979static int
15980encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015981{
Victor Stinner709d23d2019-05-02 14:56:30 -040015982 int res;
15983 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15984 if (res == -2) {
15985 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15986 return -1;
15987 }
15988 if (res < 0) {
15989 PyErr_NoMemory();
15990 return -1;
15991 }
15992 return 0;
15993}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015994
Victor Stinner709d23d2019-05-02 14:56:30 -040015995
15996static int
15997config_get_codec_name(wchar_t **config_encoding)
15998{
15999 char *encoding;
16000 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16001 return -1;
16002 }
16003
16004 PyObject *name_obj = NULL;
16005 PyObject *codec = _PyCodec_Lookup(encoding);
16006 PyMem_RawFree(encoding);
16007
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016008 if (!codec)
16009 goto error;
16010
16011 name_obj = PyObject_GetAttrString(codec, "name");
16012 Py_CLEAR(codec);
16013 if (!name_obj) {
16014 goto error;
16015 }
16016
Victor Stinner709d23d2019-05-02 14:56:30 -040016017 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16018 Py_DECREF(name_obj);
16019 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016020 goto error;
16021 }
16022
Victor Stinner709d23d2019-05-02 14:56:30 -040016023 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16024 if (raw_wname == NULL) {
16025 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016026 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016027 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016028 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016029
16030 PyMem_RawFree(*config_encoding);
16031 *config_encoding = raw_wname;
16032
16033 PyMem_Free(wname);
16034 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016035
16036error:
16037 Py_XDECREF(codec);
16038 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016039 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016040}
16041
16042
Victor Stinner331a6a52019-05-27 16:39:22 +020016043static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016044init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016045{
Victor Stinner709d23d2019-05-02 14:56:30 -040016046 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016047 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016048 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016049 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016050 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016051 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016052 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016053}
16054
16055
Victor Stinner709d23d2019-05-02 14:56:30 -040016056static int
16057init_fs_codec(PyInterpreterState *interp)
16058{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016059 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016060
16061 _Py_error_handler error_handler;
16062 error_handler = get_error_handler_wide(config->filesystem_errors);
16063 if (error_handler == _Py_ERROR_UNKNOWN) {
16064 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16065 return -1;
16066 }
16067
16068 char *encoding, *errors;
16069 if (encode_wstr_utf8(config->filesystem_encoding,
16070 &encoding,
16071 "filesystem_encoding") < 0) {
16072 return -1;
16073 }
16074
16075 if (encode_wstr_utf8(config->filesystem_errors,
16076 &errors,
16077 "filesystem_errors") < 0) {
16078 PyMem_RawFree(encoding);
16079 return -1;
16080 }
16081
Victor Stinner3d17c042020-05-14 01:48:38 +020016082 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16083 PyMem_RawFree(fs_codec->encoding);
16084 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016085 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016086 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16087 PyMem_RawFree(fs_codec->errors);
16088 fs_codec->errors = errors;
16089 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016090
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016091#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016092 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016093#endif
16094
Victor Stinner709d23d2019-05-02 14:56:30 -040016095 /* At this point, PyUnicode_EncodeFSDefault() and
16096 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16097 the C implementation of the filesystem encoding. */
16098
16099 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16100 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016101 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16102 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016103 PyErr_NoMemory();
16104 return -1;
16105 }
16106 return 0;
16107}
16108
16109
Victor Stinner331a6a52019-05-27 16:39:22 +020016110static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016111init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016112{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016113 PyInterpreterState *interp = tstate->interp;
16114
Victor Stinner709d23d2019-05-02 14:56:30 -040016115 /* Update the filesystem encoding to the normalized Python codec name.
16116 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16117 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016118 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016119 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016120 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016121 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016122 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016123 }
16124
Victor Stinner709d23d2019-05-02 14:56:30 -040016125 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016126 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016127 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016128 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016129}
16130
16131
Victor Stinner331a6a52019-05-27 16:39:22 +020016132PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016133_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016134{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016135 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016136 if (_PyStatus_EXCEPTION(status)) {
16137 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016138 }
16139
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016140 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016141}
16142
16143
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016144static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016145_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016146{
Victor Stinner3d17c042020-05-14 01:48:38 +020016147 PyMem_RawFree(fs_codec->encoding);
16148 fs_codec->encoding = NULL;
16149 fs_codec->utf8 = 0;
16150 PyMem_RawFree(fs_codec->errors);
16151 fs_codec->errors = NULL;
16152 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016153}
16154
16155
#ifdef MS_WINDOWS
/* Set the filesystem encoding of the current interpreter to mbcs/replace
   (PEP 529) and reinstall the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set when the
   new configuration strings cannot be allocated (the configuration is left
   unchanged in that case). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    /* The configuration is modified in place, hence the cast. */
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both strings before touching the configuration. */
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");
    if (encoding != NULL && errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = errors;

        return init_fs_codec(interp);
    }

    PyMem_RawFree(encoding);
    PyMem_RawFree(errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16181
16182
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016183void
Victor Stinner3d483342019-11-22 12:27:50 +010016184_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016185{
Victor Stinner3d483342019-11-22 12:27:50 +010016186 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016187#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016188 /* Insure++ is a memory analysis tool that aids in discovering
16189 * memory leaks and other memory problems. On Python exit, the
16190 * interned string dictionaries are flagged as being in use at exit
16191 * (which it is). Under normal circumstances, this is fine because
16192 * the memory will be automatically reclaimed by the system. Under
16193 * memory debugging, it's a huge source of useless noise, so we
16194 * trade off slower shutdown for less distraction in the memory
16195 * reports. -baw
16196 */
16197 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016198#endif /* __INSURE__ */
16199
Victor Stinner3d483342019-11-22 12:27:50 +010016200 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016201
Victor Stinner607b1022020-05-05 18:50:30 +020016202#ifdef LATIN1_SINGLETONS
Victor Stinner3d483342019-11-22 12:27:50 +010016203 for (Py_ssize_t i = 0; i < 256; i++) {
16204 Py_CLEAR(unicode_latin1[i]);
16205 }
Victor Stinner607b1022020-05-05 18:50:30 +020016206#endif
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016207 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016208 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016209
Victor Stinner3d17c042020-05-14 01:48:38 +020016210 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016211}
16212
16213
Georg Brandl66c221e2010-10-14 07:04:07 +000016214/* A _string module, to export formatter_parser and formatter_field_name_split
16215 to the string.Formatter class implemented in Python. */
16216
16217static PyMethodDef _string_methods[] = {
16218 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16219 METH_O, PyDoc_STR("split the argument as a field name")},
16220 {"formatter_parser", (PyCFunction) formatter_parser,
16221 METH_O, PyDoc_STR("parse the argument as a format string")},
16222 {NULL, NULL}
16223};
16224
16225static struct PyModuleDef _string_module = {
16226 PyModuleDef_HEAD_INIT,
16227 "_string",
16228 PyDoc_STR("string helper module"),
16229 0,
16230 _string_methods,
16231 NULL,
16232 NULL,
16233 NULL,
16234 NULL
16235};
16236
16237PyMODINIT_FUNC
16238PyInit__string(void)
16239{
16240 return PyModule_Create(&_string_module);
16241}
16242
16243
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016244#ifdef __cplusplus
16245}
16246#endif