blob: ea46a44bf5faacb27b6ec5dd90cb71eb6bfe927a [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner45876a92020-02-12 22:32:34 +010044#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010045#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_initconfig.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020047#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinnerbcda8f12018-11-21 22:27:47 +010048#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020049#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040050#include "pycore_pylifecycle.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020051#include "pycore_pystate.h" // _PyInterpreterState_GET()
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000052#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070053#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000055#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000056#include <windows.h>
57#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000058
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
61/* #define INTERNED_STATS 1 */
62
63
Larry Hastings61272b72014-01-07 12:41:53 -080064/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090065class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080066[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090067/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
68
69/*[python input]
70class Py_UCS4_converter(CConverter):
71 type = 'Py_UCS4'
72 converter = 'convert_uc'
73
74 def converter_init(self):
75 if self.default is not unspecified:
76 self.c_default = ascii(self.default)
77 if len(self.c_default) > 4 or self.c_default[0] != "'":
78 self.c_default = hex(ord(self.default))
79
80[python start generated code]*/
81/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080082
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchaka05997252013-01-26 12:14:02 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Victor Stinner8faf8212011-12-08 22:14:11 +010096/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
97#define MAX_UNICODE 0x10ffff
98
/* Sanity check used by the accessor macros in this file: debug builds run
   _PyUnicode_CheckConsistency() (without the content scan), other builds
   only perform the type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200104
/* Field accessors.  The variants with a leading underscore read the struct
   member directly (only _PyUnicode_KIND and _PyUnicode_GET_LENGTH assert
   the object type); PyUnicode_UTF8 and PyUnicode_UTF8_LENGTH additionally
   assert that the object is ready. */

/* The utf8 member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII objects this is the
   memory right after the PyASCIIObject header, otherwise the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* The utf8_length member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: for compact ASCII objects this is the
   string length, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer and its length. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the length, state and hash members. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Kind and length, with a type assertion. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* The data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200139
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   object is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op)                            \
          ? 0                                           \
          : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200146
/* True if the utf8 pointer of the object is the same pointer as its
   character data.  Must not be used on compact ASCII objects. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the same pointer as its
   character data.
   The macro argument is used consistently: the expansion previously
   referred to a variable named `unicode` instead of `op`, which only
   compiled (and only tested the right object) when the caller happened
   to pass a variable with exactly that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
154
/* True if the Unicode object owns a separate UTF-8 memory block, i.e. it is
   not compact ASCII, its utf8 pointer is set, and that pointer is not the
   character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separate wstr memory block, i.e. its
   wstr pointer is set and the object is either not ready or the pointer is
   not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
168
/* Generic helper macro converting a run of characters from one integer
   character type to another.

   from_type, to_type: valid type names.
   begin, end:         delimit the source characters ("from_type *").
   to:                 destination buffer ("to_type *").

   Each source character is cast to to_type.  The main loop handles four
   characters per iteration; a second loop copies the remainder. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t n = (_end) - (_iter);                                \
        /* end of the part whose length is a multiple of 4 */           \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                          \
        while (_iter < (_unrolled_end)) {                               \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
            _iter += 4; _to += 4;                                       \
        }                                                               \
        while (_iter < (_end))                                          \
            *_to++ = (to_type) *_iter++;                                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200192
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% works best. */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% works best. */
#  define OVERALLOCATE_FACTOR 4
#endif
200
Victor Stinner607b1022020-05-05 18:50:30 +0200201/* bpo-40521: Interned strings are shared by all interpreters. */
202#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
203# define INTERNED_STRINGS
204#endif
205
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Victor Stinner607b1022020-05-05 18:50:30 +0200214#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200215static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200216#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000217
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000218/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200219static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200220
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If the creation fails, unicode_empty
   stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return the shared empty string from the enclosing function (the returned
   value is NULL if the empty string could not be created). */
#define _Py_RETURN_UNICODE_EMPTY()                                      \
    do {                                                                \
        _Py_INCREF_UNICODE_EMPTY();                                     \
        return unicode_empty;                                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000239
Victor Stinner59423e32018-11-26 13:40:01 +0100240static inline void
241unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
242 Py_ssize_t start, Py_ssize_t length)
243{
244 assert(0 <= start);
245 assert(kind != PyUnicode_WCHAR_KIND);
246 switch (kind) {
247 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100248 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100249 Py_UCS1 ch = (unsigned char)value;
250 Py_UCS1 *to = (Py_UCS1 *)data + start;
251 memset(to, ch, length);
252 break;
253 }
254 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100255 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100256 Py_UCS2 ch = (Py_UCS2)value;
257 Py_UCS2 *to = (Py_UCS2 *)data + start;
258 const Py_UCS2 *end = to + length;
259 for (; to < end; ++to) *to = ch;
260 break;
261 }
262 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100263 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100264 Py_UCS4 ch = value;
265 Py_UCS4 * to = (Py_UCS4 *)data + start;
266 const Py_UCS4 *end = to + length;
267 for (; to < end; ++to) *to = ch;
268 break;
269 }
270 default: Py_UNREACHABLE();
271 }
272}
273
274
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200275/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700276static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200277_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900278static inline void
279_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400280static PyObject *
281unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
282 const char *errors);
283static PyObject *
284unicode_decode_utf8(const char *s, Py_ssize_t size,
285 _Py_error_handler error_handler, const char *errors,
286 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200287
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200288/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200289static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200290
Victor Stinner607b1022020-05-05 18:50:30 +0200291/* bpo-40521: Latin1 singletons are shared by all interpreters. */
292#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
293# define LATIN1_SINGLETONS
294#endif
295
296#ifdef LATIN1_SINGLETONS
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297/* Single character Unicode strings in the Latin-1 range are being
298 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200299static PyObject *unicode_latin1[256] = {NULL};
Victor Stinner607b1022020-05-05 18:50:30 +0200300#endif
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000301
Christian Heimes190d79e2008-01-30 11:58:22 +0000302/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by a 7-bit character code: 1 for the whitespace
   characters listed below, 0 for every other entry. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
332
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200333/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200334static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200335static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100336static int unicode_modifiable(PyObject *unicode);
337
Victor Stinnerfe226c02011-10-03 03:52:20 +0200338
Alexander Belopolsky40018472011-02-26 01:02:56 +0000339static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100340_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200341static PyObject *
342_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
343static PyObject *
344_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
345
346static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000347unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000348 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100349 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000350 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
351
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352static void
353raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300354 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100355 PyObject *unicode,
356 Py_ssize_t startpos, Py_ssize_t endpos,
357 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000358
Christian Heimes190d79e2008-01-30 11:58:22 +0000359/* Same for linebreaks */
/* Lookup table indexed by a 7-bit character code: 1 for the line break
   characters listed below, 0 for every other entry. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
386
INADA Naoki3ae20562017-01-16 20:41:20 +0900387static int convert_uc(PyObject *obj, void *addr);
388
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300389#include "clinic/unicodeobject.c.h"
390
Victor Stinner3d4226a2018-08-29 22:21:32 +0200391_Py_error_handler
392_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200393{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200394 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200395 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200396 }
397 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 }
400 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200401 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200402 }
403 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200404 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200405 }
406 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200407 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200408 }
409 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200410 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200411 }
412 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200413 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200414 }
Victor Stinner50149202015-09-22 00:26:54 +0200415 return _Py_ERROR_OTHER;
416}
417
Victor Stinner709d23d2019-05-02 14:56:30 -0400418
419static _Py_error_handler
420get_error_handler_wide(const wchar_t *errors)
421{
422 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
423 return _Py_ERROR_STRICT;
424 }
425 if (wcscmp(errors, L"surrogateescape") == 0) {
426 return _Py_ERROR_SURROGATEESCAPE;
427 }
428 if (wcscmp(errors, L"replace") == 0) {
429 return _Py_ERROR_REPLACE;
430 }
431 if (wcscmp(errors, L"ignore") == 0) {
432 return _Py_ERROR_IGNORE;
433 }
434 if (wcscmp(errors, L"backslashreplace") == 0) {
435 return _Py_ERROR_BACKSLASHREPLACE;
436 }
437 if (wcscmp(errors, L"surrogatepass") == 0) {
438 return _Py_ERROR_SURROGATEPASS;
439 }
440 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
441 return _Py_ERROR_XMLCHARREFREPLACE;
442 }
443 return _Py_ERROR_OTHER;
444}
445
446
Victor Stinner22eb6892019-06-26 00:51:05 +0200447static inline int
448unicode_check_encoding_errors(const char *encoding, const char *errors)
449{
450 if (encoding == NULL && errors == NULL) {
451 return 0;
452 }
453
Victor Stinner81a7be32020-04-14 15:14:01 +0200454 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200455#ifndef Py_DEBUG
456 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200457 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200458 return 0;
459 }
460#else
461 /* Always check in debug mode */
462#endif
463
464 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
465 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200466 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200467 return 0;
468 }
469
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200470 /* Disable checks during Python finalization. For example, it allows to
471 call _PyObject_Dump() during finalization for debugging purpose. */
472 if (interp->finalizing) {
473 return 0;
474 }
475
Victor Stinner22eb6892019-06-26 00:51:05 +0200476 if (encoding != NULL) {
477 PyObject *handler = _PyCodec_Lookup(encoding);
478 if (handler == NULL) {
479 return -1;
480 }
481 Py_DECREF(handler);
482 }
483
484 if (errors != NULL) {
485 PyObject *handler = PyCodec_LookupError(errors);
486 if (handler == NULL) {
487 return -1;
488 }
489 Py_DECREF(handler);
490 }
491 return 0;
492}
493
494
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300495/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
496 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000497Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000498PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000499{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000500#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000501 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000502#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000503 /* This is actually an illegal character, so it should
504 not be passed to unichr. */
505 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000506#endif
507}
508
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200509int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100510_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200511{
Victor Stinner68762572019-10-07 18:42:01 +0200512#define CHECK(expr) \
513 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
514
Victor Stinner910337b2011-10-03 03:20:16 +0200515 PyASCIIObject *ascii;
516 unsigned int kind;
517
Victor Stinner68762572019-10-07 18:42:01 +0200518 assert(op != NULL);
519 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200520
521 ascii = (PyASCIIObject *)op;
522 kind = ascii->state.kind;
523
Victor Stinnera3b334d2011-10-03 13:53:37 +0200524 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(kind == PyUnicode_1BYTE_KIND);
526 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200527 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200528 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200529 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200530 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200531
Victor Stinnera41463c2011-10-04 01:05:08 +0200532 if (ascii->state.compact == 1) {
533 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200534 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200535 || kind == PyUnicode_2BYTE_KIND
536 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200537 CHECK(ascii->state.ascii == 0);
538 CHECK(ascii->state.ready == 1);
539 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100540 }
541 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200542 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
543
544 data = unicode->data.any;
545 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200546 CHECK(ascii->length == 0);
547 CHECK(ascii->hash == -1);
548 CHECK(ascii->state.compact == 0);
549 CHECK(ascii->state.ascii == 0);
550 CHECK(ascii->state.ready == 0);
551 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
552 CHECK(ascii->wstr != NULL);
553 CHECK(data == NULL);
554 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200555 }
556 else {
Victor Stinner68762572019-10-07 18:42:01 +0200557 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200558 || kind == PyUnicode_2BYTE_KIND
559 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200560 CHECK(ascii->state.compact == 0);
561 CHECK(ascii->state.ready == 1);
562 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200563 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200564 CHECK(compact->utf8 == data);
565 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200566 }
567 else
Victor Stinner68762572019-10-07 18:42:01 +0200568 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200569 }
570 }
571 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200572 if (
573#if SIZEOF_WCHAR_T == 2
574 kind == PyUnicode_2BYTE_KIND
575#else
576 kind == PyUnicode_4BYTE_KIND
577#endif
578 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200579 {
Victor Stinner68762572019-10-07 18:42:01 +0200580 CHECK(ascii->wstr == data);
581 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200582 } else
Victor Stinner68762572019-10-07 18:42:01 +0200583 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200584 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200585
586 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200587 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200588 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200589 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200590 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200591
592 /* check that the best kind is used: O(n) operation */
593 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200594 Py_ssize_t i;
595 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300596 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200597 Py_UCS4 ch;
598
599 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200600 for (i=0; i < ascii->length; i++)
601 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200602 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200603 if (ch > maxchar)
604 maxchar = ch;
605 }
606 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100607 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200608 CHECK(maxchar >= 128);
609 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100610 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200611 else
Victor Stinner68762572019-10-07 18:42:01 +0200612 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200613 }
Victor Stinner77faf692011-11-20 18:56:05 +0100614 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200615 CHECK(maxchar >= 0x100);
616 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100617 }
618 else {
Victor Stinner68762572019-10-07 18:42:01 +0200619 CHECK(maxchar >= 0x10000);
620 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100621 }
Victor Stinner68762572019-10-07 18:42:01 +0200622 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200623 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400624 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200625
626#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400627}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200628
Victor Stinner910337b2011-10-03 03:20:16 +0200629
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100630static PyObject*
631unicode_result_wchar(PyObject *unicode)
632{
633#ifndef Py_DEBUG
634 Py_ssize_t len;
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 len = _PyUnicode_WSTR_LENGTH(unicode);
637 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200639 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100640 }
641
642 if (len == 1) {
643 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100644 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100645 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
646 Py_DECREF(unicode);
647 return latin1_char;
648 }
649 }
650
651 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200652 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100653 return NULL;
654 }
655#else
Victor Stinneraa771272012-10-04 02:32:58 +0200656 assert(Py_REFCNT(unicode) == 1);
657
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100658 /* don't make the result ready in debug mode to ensure that the caller
659 makes the string ready before using it */
660 assert(_PyUnicode_CheckConsistency(unicode, 1));
661#endif
662 return unicode;
663}
664
665static PyObject*
666unicode_result_ready(PyObject *unicode)
667{
668 Py_ssize_t length;
669
670 length = PyUnicode_GET_LENGTH(unicode);
671 if (length == 0) {
672 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100673 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200674 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100675 }
676 return unicode_empty;
677 }
678
Victor Stinner607b1022020-05-05 18:50:30 +0200679#ifdef LATIN1_SINGLETONS
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100680 if (length == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300681 const void *data = PyUnicode_DATA(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +0200682 int kind = PyUnicode_KIND(unicode);
683 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100684 if (ch < 256) {
685 PyObject *latin1_char = unicode_latin1[ch];
686 if (latin1_char != NULL) {
687 if (unicode != latin1_char) {
688 Py_INCREF(latin1_char);
689 Py_DECREF(unicode);
690 }
691 return latin1_char;
692 }
693 else {
694 assert(_PyUnicode_CheckConsistency(unicode, 1));
695 Py_INCREF(unicode);
696 unicode_latin1[ch] = unicode;
697 return unicode;
698 }
699 }
700 }
Victor Stinner607b1022020-05-05 18:50:30 +0200701#endif
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100702
703 assert(_PyUnicode_CheckConsistency(unicode, 1));
704 return unicode;
705}
706
707static PyObject*
708unicode_result(PyObject *unicode)
709{
710 assert(_PyUnicode_CHECK(unicode));
711 if (PyUnicode_IS_READY(unicode))
712 return unicode_result_ready(unicode);
713 else
714 return unicode_result_wchar(unicode);
715}
716
Victor Stinnerc4b49542011-12-11 22:44:26 +0100717static PyObject*
718unicode_result_unchanged(PyObject *unicode)
719{
720 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500721 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100722 return NULL;
723 Py_INCREF(unicode);
724 return unicode;
725 }
726 else
727 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100728 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100729}
730
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200731/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
732 ASCII, Latin1, UTF-8, etc. */
733static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200734backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
736{
Victor Stinnerad771582015-10-09 12:38:53 +0200737 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200738 Py_UCS4 ch;
739 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300740 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200741
742 assert(PyUnicode_IS_READY(unicode));
743 kind = PyUnicode_KIND(unicode);
744 data = PyUnicode_DATA(unicode);
745
746 size = 0;
747 /* determine replacement size */
748 for (i = collstart; i < collend; ++i) {
749 Py_ssize_t incr;
750
751 ch = PyUnicode_READ(kind, data, i);
752 if (ch < 0x100)
753 incr = 2+2;
754 else if (ch < 0x10000)
755 incr = 2+4;
756 else {
757 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200758 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200759 }
760 if (size > PY_SSIZE_T_MAX - incr) {
761 PyErr_SetString(PyExc_OverflowError,
762 "encoded result is too long for a Python string");
763 return NULL;
764 }
765 size += incr;
766 }
767
Victor Stinnerad771582015-10-09 12:38:53 +0200768 str = _PyBytesWriter_Prepare(writer, str, size);
769 if (str == NULL)
770 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200771
772 /* generate replacement */
773 for (i = collstart; i < collend; ++i) {
774 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200775 *str++ = '\\';
776 if (ch >= 0x00010000) {
777 *str++ = 'U';
778 *str++ = Py_hexdigits[(ch>>28)&0xf];
779 *str++ = Py_hexdigits[(ch>>24)&0xf];
780 *str++ = Py_hexdigits[(ch>>20)&0xf];
781 *str++ = Py_hexdigits[(ch>>16)&0xf];
782 *str++ = Py_hexdigits[(ch>>12)&0xf];
783 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200784 }
Victor Stinner797485e2015-10-09 03:17:30 +0200785 else if (ch >= 0x100) {
786 *str++ = 'u';
787 *str++ = Py_hexdigits[(ch>>12)&0xf];
788 *str++ = Py_hexdigits[(ch>>8)&0xf];
789 }
790 else
791 *str++ = 'x';
792 *str++ = Py_hexdigits[(ch>>4)&0xf];
793 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200794 }
795 return str;
796}
797
798/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
799 ASCII, Latin1, UTF-8, etc. */
800static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200801xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200802 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
803{
Victor Stinnerad771582015-10-09 12:38:53 +0200804 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200805 Py_UCS4 ch;
806 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300807 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200808
809 assert(PyUnicode_IS_READY(unicode));
810 kind = PyUnicode_KIND(unicode);
811 data = PyUnicode_DATA(unicode);
812
813 size = 0;
814 /* determine replacement size */
815 for (i = collstart; i < collend; ++i) {
816 Py_ssize_t incr;
817
818 ch = PyUnicode_READ(kind, data, i);
819 if (ch < 10)
820 incr = 2+1+1;
821 else if (ch < 100)
822 incr = 2+2+1;
823 else if (ch < 1000)
824 incr = 2+3+1;
825 else if (ch < 10000)
826 incr = 2+4+1;
827 else if (ch < 100000)
828 incr = 2+5+1;
829 else if (ch < 1000000)
830 incr = 2+6+1;
831 else {
832 assert(ch <= MAX_UNICODE);
833 incr = 2+7+1;
834 }
835 if (size > PY_SSIZE_T_MAX - incr) {
836 PyErr_SetString(PyExc_OverflowError,
837 "encoded result is too long for a Python string");
838 return NULL;
839 }
840 size += incr;
841 }
842
Victor Stinnerad771582015-10-09 12:38:53 +0200843 str = _PyBytesWriter_Prepare(writer, str, size);
844 if (str == NULL)
845 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200846
847 /* generate replacement */
848 for (i = collstart; i < collend; ++i) {
849 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
850 }
851 return str;
852}
853
Thomas Wouters477c8d52006-05-27 19:21:47 +0000854/* --- Bloom Filters ----------------------------------------------------- */
855
856/* stuff to implement simple "bloom filters" for Unicode characters.
857 to keep things simple, we use a single bitmask, using the least 5
858 bits from each unicode characters as the bit index. */
859
860/* the linebreak mask is set up by Unicode_Init below */
861
Antoine Pitrouf068f942010-01-13 14:19:12 +0000862#if LONG_BIT >= 128
863#define BLOOM_WIDTH 128
864#elif LONG_BIT >= 64
865#define BLOOM_WIDTH 64
866#elif LONG_BIT >= 32
867#define BLOOM_WIDTH 32
868#else
869#error "LONG_BIT is smaller than 32"
870#endif
871
Thomas Wouters477c8d52006-05-27 19:21:47 +0000872#define BLOOM_MASK unsigned long
873
Serhiy Storchaka05997252013-01-26 12:14:02 +0200874static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000875
Antoine Pitrouf068f942010-01-13 14:19:12 +0000876#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877
Benjamin Peterson29060642009-01-31 22:14:21 +0000878#define BLOOM_LINEBREAK(ch) \
879 ((ch) < 128U ? ascii_linebreak[(ch)] : \
880 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000881
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700882static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300883make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884{
Victor Stinnera85af502013-04-09 21:53:54 +0200885#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
886 do { \
887 TYPE *data = (TYPE *)PTR; \
888 TYPE *end = data + LEN; \
889 Py_UCS4 ch; \
890 for (; data != end; data++) { \
891 ch = *data; \
892 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
893 } \
894 break; \
895 } while (0)
896
Thomas Wouters477c8d52006-05-27 19:21:47 +0000897 /* calculate simple bloom-style bitmask for a given unicode string */
898
Antoine Pitrouf068f942010-01-13 14:19:12 +0000899 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000900
901 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200902 switch (kind) {
903 case PyUnicode_1BYTE_KIND:
904 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
905 break;
906 case PyUnicode_2BYTE_KIND:
907 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
908 break;
909 case PyUnicode_4BYTE_KIND:
910 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
911 break;
912 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700913 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200914 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000915 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200916
917#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000918}
919
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300920static int
921ensure_unicode(PyObject *obj)
922{
923 if (!PyUnicode_Check(obj)) {
924 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200925 "must be str, not %.100s",
926 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300927 return -1;
928 }
929 return PyUnicode_READY(obj);
930}
931
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200932/* Compilation of templated routines */
933
934#include "stringlib/asciilib.h"
935#include "stringlib/fastsearch.h"
936#include "stringlib/partition.h"
937#include "stringlib/split.h"
938#include "stringlib/count.h"
939#include "stringlib/find.h"
940#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200941#include "stringlib/undef.h"
942
943#include "stringlib/ucs1lib.h"
944#include "stringlib/fastsearch.h"
945#include "stringlib/partition.h"
946#include "stringlib/split.h"
947#include "stringlib/count.h"
948#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300949#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200950#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200951#include "stringlib/undef.h"
952
953#include "stringlib/ucs2lib.h"
954#include "stringlib/fastsearch.h"
955#include "stringlib/partition.h"
956#include "stringlib/split.h"
957#include "stringlib/count.h"
958#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300959#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200960#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200961#include "stringlib/undef.h"
962
963#include "stringlib/ucs4lib.h"
964#include "stringlib/fastsearch.h"
965#include "stringlib/partition.h"
966#include "stringlib/split.h"
967#include "stringlib/count.h"
968#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300969#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200970#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200971#include "stringlib/undef.h"
972
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200973#include "stringlib/unicodedefs.h"
974#include "stringlib/fastsearch.h"
975#include "stringlib/count.h"
976#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100977#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200978
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979/* --- Unicode Object ----------------------------------------------------- */
980
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700981static inline Py_ssize_t
982findchar(const void *s, int kind,
983 Py_ssize_t size, Py_UCS4 ch,
984 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 switch (kind) {
987 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200988 if ((Py_UCS1) ch != ch)
989 return -1;
990 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600991 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600993 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200994 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200995 if ((Py_UCS2) ch != ch)
996 return -1;
997 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600998 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001000 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001001 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001004 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001006 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001007 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009}
1010
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters added by a resize are touched: bytes from index
   old_length up to (but excluding) the current length are set to 0xff.
   Nothing is done when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* offsets are in bytes: the kind is used as the character width */
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030static PyObject*
1031resize_compact(PyObject *unicode, Py_ssize_t length)
1032{
1033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035 Py_ssize_t new_size;
1036 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001037 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1040#endif
1041
Victor Stinner79891572012-05-03 13:43:07 +02001042 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001044 assert(PyUnicode_IS_COMPACT(unicode));
1045
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001046 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001047 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001048 struct_size = sizeof(PyASCIIObject);
1049 else
1050 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001051 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1054 PyErr_NoMemory();
1055 return NULL;
1056 }
1057 new_size = (struct_size + (length + 1) * char_size);
1058
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001059 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1060 PyObject_DEL(_PyUnicode_UTF8(unicode));
1061 _PyUnicode_UTF8(unicode) = NULL;
1062 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1063 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001064#ifdef Py_REF_DEBUG
1065 _Py_RefTotal--;
1066#endif
1067#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001068 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001069#endif
Victor Stinner84def372011-12-11 20:04:56 +01001070
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001071 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001072 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001073 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 PyErr_NoMemory();
1075 return NULL;
1076 }
Victor Stinner84def372011-12-11 20:04:56 +01001077 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001079
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001081 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001083 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 _PyUnicode_WSTR_LENGTH(unicode) = length;
1085 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001086 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1087 PyObject_DEL(_PyUnicode_WSTR(unicode));
1088 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001089 if (!PyUnicode_IS_ASCII(unicode))
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001091 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001092#ifdef Py_DEBUG
1093 unicode_fill_invalid(unicode, old_length);
1094#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1096 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098 return unicode;
1099}
1100
Alexander Belopolsky40018472011-02-26 01:02:56 +00001101static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103{
Victor Stinner95663112011-10-04 01:03:50 +02001104 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001108
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109 if (PyUnicode_IS_READY(unicode)) {
1110 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001113#ifdef Py_DEBUG
1114 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1115#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116
1117 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001118 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1120 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121
1122 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1123 PyErr_NoMemory();
1124 return -1;
1125 }
1126 new_size = (length + 1) * char_size;
1127
Victor Stinner7a9105a2011-12-12 00:13:42 +01001128 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1129 {
1130 PyObject_DEL(_PyUnicode_UTF8(unicode));
1131 _PyUnicode_UTF8(unicode) = NULL;
1132 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1133 }
1134
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 data = (PyObject *)PyObject_REALLOC(data, new_size);
1136 if (data == NULL) {
1137 PyErr_NoMemory();
1138 return -1;
1139 }
1140 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001141 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001142 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001143 _PyUnicode_WSTR_LENGTH(unicode) = length;
1144 }
1145 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001146 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_UTF8_LENGTH(unicode) = length;
1148 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 _PyUnicode_LENGTH(unicode) = length;
1150 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001151#ifdef Py_DEBUG
1152 unicode_fill_invalid(unicode, old_length);
1153#endif
Victor Stinner95663112011-10-04 01:03:50 +02001154 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001155 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001158 }
Victor Stinner95663112011-10-04 01:03:50 +02001159 assert(_PyUnicode_WSTR(unicode) != NULL);
1160
1161 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001162 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001166 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001167 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001169 if (!wstr) {
1170 PyErr_NoMemory();
1171 return -1;
1172 }
1173 _PyUnicode_WSTR(unicode) = wstr;
1174 _PyUnicode_WSTR(unicode)[length] = 0;
1175 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001176 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return 0;
1178}
1179
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180static PyObject*
1181resize_copy(PyObject *unicode, Py_ssize_t length)
1182{
1183 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001184 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001187 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188
1189 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1190 if (copy == NULL)
1191 return NULL;
1192
1193 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001194 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001196 }
1197 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001198 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001199
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001200 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 if (w == NULL)
1202 return NULL;
1203 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1204 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001205 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001206 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001207 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 }
1209}
1210
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001212 Ux0000 terminated; some code (e.g. new_identifier)
1213 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214
1215 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218*/
1219
Alexander Belopolsky40018472011-02-26 01:02:56 +00001220static PyUnicodeObject *
1221_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001223 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
Thomas Wouters477c8d52006-05-27 19:21:47 +00001226 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227 if (length == 0 && unicode_empty != NULL) {
1228 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001229 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 }
1231
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001232 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001233 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001234 return (PyUnicodeObject *)PyErr_NoMemory();
1235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001236 if (length < 0) {
1237 PyErr_SetString(PyExc_SystemError,
1238 "Negative size passed to _PyUnicode_New");
1239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 }
1241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1243 if (unicode == NULL)
1244 return NULL;
1245 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001246
1247 _PyUnicode_WSTR_LENGTH(unicode) = length;
1248 _PyUnicode_HASH(unicode) = -1;
1249 _PyUnicode_STATE(unicode).interned = 0;
1250 _PyUnicode_STATE(unicode).kind = 0;
1251 _PyUnicode_STATE(unicode).compact = 0;
1252 _PyUnicode_STATE(unicode).ready = 0;
1253 _PyUnicode_STATE(unicode).ascii = 0;
1254 _PyUnicode_DATA_ANY(unicode) = NULL;
1255 _PyUnicode_LENGTH(unicode) = 0;
1256 _PyUnicode_UTF8(unicode) = NULL;
1257 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001259 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1260 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001261 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001262 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001263 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001265
Jeremy Hyltond8082792003-09-16 19:41:39 +00001266 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001267 * the caller fails before initializing str -- unicode_resize()
1268 * reads str[0], and the Keep-Alive optimization can keep memory
1269 * allocated for str alive across a call to unicode_dealloc(unicode).
1270 * We don't want unicode_resize to read uninitialized memory in
1271 * that case.
1272 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001273 _PyUnicode_WSTR(unicode)[0] = 0;
1274 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001275
Victor Stinner7931d9a2011-11-04 00:22:48 +01001276 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 return unicode;
1278}
1279
Victor Stinnerf42dc442011-10-02 23:33:16 +02001280static const char*
1281unicode_kind_name(PyObject *unicode)
1282{
Victor Stinner42dfd712011-10-03 14:41:45 +02001283 /* don't check consistency: unicode_kind_name() is called from
1284 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001285 if (!PyUnicode_IS_COMPACT(unicode))
1286 {
1287 if (!PyUnicode_IS_READY(unicode))
1288 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001289 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001290 {
1291 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001292 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001293 return "legacy ascii";
1294 else
1295 return "legacy latin1";
1296 case PyUnicode_2BYTE_KIND:
1297 return "legacy UCS2";
1298 case PyUnicode_4BYTE_KIND:
1299 return "legacy UCS4";
1300 default:
1301 return "<legacy invalid kind>";
1302 }
1303 }
1304 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001305 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001306 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001307 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001308 return "ascii";
1309 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001310 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001311 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001312 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001313 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001314 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001315 default:
1316 return "<invalid compact kind>";
1317 }
1318}
1319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001322const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001323 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001324 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325}
1326
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001327const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001328 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001329 return _PyUnicode_COMPACT_DATA(unicode);
1330}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001331const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001332 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001333 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1335 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1336 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1337 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1338 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1339 return PyUnicode_DATA(unicode);
1340}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001341
1342void
1343_PyUnicode_Dump(PyObject *op)
1344{
1345 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001346 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1347 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001348 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001349
Victor Stinnera849a4b2011-10-03 12:12:11 +02001350 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001351 {
1352 if (ascii->state.ascii)
1353 data = (ascii + 1);
1354 else
1355 data = (compact + 1);
1356 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001357 else
1358 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001359 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1360 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001361
Victor Stinnera849a4b2011-10-03 12:12:11 +02001362 if (ascii->wstr == data)
1363 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001364 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001365
Victor Stinnera3b334d2011-10-03 13:53:37 +02001366 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001367 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001368 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1369 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001370 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001371 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001372 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001373 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001374}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375#endif
1376
1377PyObject *
1378PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1379{
1380 PyObject *obj;
1381 PyCompactUnicodeObject *unicode;
1382 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001383 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001384 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 Py_ssize_t char_size;
1386 Py_ssize_t struct_size;
1387
1388 /* Optimization for empty strings */
1389 if (size == 0 && unicode_empty != NULL) {
1390 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001391 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 }
1393
Victor Stinner9e9d6892011-10-04 01:02:02 +02001394 is_ascii = 0;
1395 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 struct_size = sizeof(PyCompactUnicodeObject);
1397 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001398 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 char_size = 1;
1400 is_ascii = 1;
1401 struct_size = sizeof(PyASCIIObject);
1402 }
1403 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001404 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 char_size = 1;
1406 }
1407 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001408 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 char_size = 2;
1410 if (sizeof(wchar_t) == 2)
1411 is_sharing = 1;
1412 }
1413 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001414 if (maxchar > MAX_UNICODE) {
1415 PyErr_SetString(PyExc_SystemError,
1416 "invalid maximum character passed to PyUnicode_New");
1417 return NULL;
1418 }
Victor Stinner8f825062012-04-27 13:55:39 +02001419 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 char_size = 4;
1421 if (sizeof(wchar_t) == 4)
1422 is_sharing = 1;
1423 }
1424
1425 /* Ensure we won't overflow the size. */
1426 if (size < 0) {
1427 PyErr_SetString(PyExc_SystemError,
1428 "Negative size passed to PyUnicode_New");
1429 return NULL;
1430 }
1431 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1432 return PyErr_NoMemory();
1433
1434 /* Duplicated allocation code from _PyObject_New() instead of a call to
1435 * PyObject_New() so we are able to allocate space for the object and
1436 * it's data buffer.
1437 */
1438 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1439 if (obj == NULL)
1440 return PyErr_NoMemory();
1441 obj = PyObject_INIT(obj, &PyUnicode_Type);
1442 if (obj == NULL)
1443 return NULL;
1444
1445 unicode = (PyCompactUnicodeObject *)obj;
1446 if (is_ascii)
1447 data = ((PyASCIIObject*)obj) + 1;
1448 else
1449 data = unicode + 1;
1450 _PyUnicode_LENGTH(unicode) = size;
1451 _PyUnicode_HASH(unicode) = -1;
1452 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001453 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001454 _PyUnicode_STATE(unicode).compact = 1;
1455 _PyUnicode_STATE(unicode).ready = 1;
1456 _PyUnicode_STATE(unicode).ascii = is_ascii;
1457 if (is_ascii) {
1458 ((char*)data)[size] = 0;
1459 _PyUnicode_WSTR(unicode) = NULL;
1460 }
Victor Stinner8f825062012-04-27 13:55:39 +02001461 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 ((char*)data)[size] = 0;
1463 _PyUnicode_WSTR(unicode) = NULL;
1464 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001466 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 else {
1469 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001470 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001471 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001473 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 ((Py_UCS4*)data)[size] = 0;
1475 if (is_sharing) {
1476 _PyUnicode_WSTR_LENGTH(unicode) = size;
1477 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1478 }
1479 else {
1480 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1481 _PyUnicode_WSTR(unicode) = NULL;
1482 }
1483 }
Victor Stinner8f825062012-04-27 13:55:39 +02001484#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001485 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001486#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001487 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 return obj;
1489}
1490
1491#if SIZEOF_WCHAR_T == 2
1492/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1493 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001494 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495
1496 This function assumes that unicode can hold one more code point than wstr
1497 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001498static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001500 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501{
1502 const wchar_t *iter;
1503 Py_UCS4 *ucs4_out;
1504
Victor Stinner910337b2011-10-03 03:20:16 +02001505 assert(unicode != NULL);
1506 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1508 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1509
1510 for (iter = begin; iter < end; ) {
1511 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1512 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001513 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1514 && (iter+1) < end
1515 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 {
Victor Stinner551ac952011-11-29 22:58:13 +01001517 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518 iter += 2;
1519 }
1520 else {
1521 *ucs4_out++ = *iter;
1522 iter++;
1523 }
1524 }
1525 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1526 _PyUnicode_GET_LENGTH(unicode)));
1527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001528}
1529#endif
1530
Victor Stinnercd9950f2011-10-02 00:34:53 +02001531static int
Victor Stinner488fa492011-12-12 00:01:39 +01001532unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001533{
Victor Stinner488fa492011-12-12 00:01:39 +01001534 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001535 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001536 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001537 return -1;
1538 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001539 return 0;
1540}
1541
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001542static int
1543_copy_characters(PyObject *to, Py_ssize_t to_start,
1544 PyObject *from, Py_ssize_t from_start,
1545 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001546{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001547 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001548 const void *from_data;
1549 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550
Victor Stinneree4544c2012-05-09 22:24:08 +02001551 assert(0 <= how_many);
1552 assert(0 <= from_start);
1553 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001554 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001555 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001556 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001557
Victor Stinnerd3f08822012-05-29 12:57:52 +02001558 assert(PyUnicode_Check(to));
1559 assert(PyUnicode_IS_READY(to));
1560 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1561
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001562 if (how_many == 0)
1563 return 0;
1564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001566 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001568 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569
Victor Stinnerf1852262012-06-16 16:38:26 +02001570#ifdef Py_DEBUG
1571 if (!check_maxchar
1572 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1573 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001574 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001575 Py_UCS4 ch;
1576 Py_ssize_t i;
1577 for (i=0; i < how_many; i++) {
1578 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1579 assert(ch <= to_maxchar);
1580 }
1581 }
1582#endif
1583
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001584 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001585 if (check_maxchar
1586 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1587 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001588 /* Writing Latin-1 characters into an ASCII string requires to
1589 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001590 Py_UCS4 max_char;
1591 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001592 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001593 if (max_char >= 128)
1594 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001595 }
Christian Heimesf051e432016-09-13 20:22:02 +02001596 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001597 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001598 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001600 else if (from_kind == PyUnicode_1BYTE_KIND
1601 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001602 {
1603 _PyUnicode_CONVERT_BYTES(
1604 Py_UCS1, Py_UCS2,
1605 PyUnicode_1BYTE_DATA(from) + from_start,
1606 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1607 PyUnicode_2BYTE_DATA(to) + to_start
1608 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001609 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001610 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001611 && to_kind == PyUnicode_4BYTE_KIND)
1612 {
1613 _PyUnicode_CONVERT_BYTES(
1614 Py_UCS1, Py_UCS4,
1615 PyUnicode_1BYTE_DATA(from) + from_start,
1616 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1617 PyUnicode_4BYTE_DATA(to) + to_start
1618 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001619 }
1620 else if (from_kind == PyUnicode_2BYTE_KIND
1621 && to_kind == PyUnicode_4BYTE_KIND)
1622 {
1623 _PyUnicode_CONVERT_BYTES(
1624 Py_UCS2, Py_UCS4,
1625 PyUnicode_2BYTE_DATA(from) + from_start,
1626 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1627 PyUnicode_4BYTE_DATA(to) + to_start
1628 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001629 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001630 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001631 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1632
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001633 if (!check_maxchar) {
1634 if (from_kind == PyUnicode_2BYTE_KIND
1635 && to_kind == PyUnicode_1BYTE_KIND)
1636 {
1637 _PyUnicode_CONVERT_BYTES(
1638 Py_UCS2, Py_UCS1,
1639 PyUnicode_2BYTE_DATA(from) + from_start,
1640 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1641 PyUnicode_1BYTE_DATA(to) + to_start
1642 );
1643 }
1644 else if (from_kind == PyUnicode_4BYTE_KIND
1645 && to_kind == PyUnicode_1BYTE_KIND)
1646 {
1647 _PyUnicode_CONVERT_BYTES(
1648 Py_UCS4, Py_UCS1,
1649 PyUnicode_4BYTE_DATA(from) + from_start,
1650 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1651 PyUnicode_1BYTE_DATA(to) + to_start
1652 );
1653 }
1654 else if (from_kind == PyUnicode_4BYTE_KIND
1655 && to_kind == PyUnicode_2BYTE_KIND)
1656 {
1657 _PyUnicode_CONVERT_BYTES(
1658 Py_UCS4, Py_UCS2,
1659 PyUnicode_4BYTE_DATA(from) + from_start,
1660 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1661 PyUnicode_2BYTE_DATA(to) + to_start
1662 );
1663 }
1664 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001665 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001666 }
1667 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001668 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001669 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001670 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001671 Py_ssize_t i;
1672
Victor Stinnera0702ab2011-09-29 14:14:38 +02001673 for (i=0; i < how_many; i++) {
1674 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001675 if (ch > to_maxchar)
1676 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001677 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1678 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001679 }
1680 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001681 return 0;
1682}
1683
Victor Stinnerd3f08822012-05-29 12:57:52 +02001684void
1685_PyUnicode_FastCopyCharacters(
1686 PyObject *to, Py_ssize_t to_start,
1687 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001688{
1689 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1690}
1691
1692Py_ssize_t
1693PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1694 PyObject *from, Py_ssize_t from_start,
1695 Py_ssize_t how_many)
1696{
1697 int err;
1698
1699 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1700 PyErr_BadInternalCall();
1701 return -1;
1702 }
1703
Benjamin Petersonbac79492012-01-14 13:34:47 -05001704 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001705 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001706 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001707 return -1;
1708
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001709 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001710 PyErr_SetString(PyExc_IndexError, "string index out of range");
1711 return -1;
1712 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001713 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001714 PyErr_SetString(PyExc_IndexError, "string index out of range");
1715 return -1;
1716 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001717 if (how_many < 0) {
1718 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1719 return -1;
1720 }
1721 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001722 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1723 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001724 "Cannot write %zi characters at %zi "
1725 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001726 how_many, to_start, PyUnicode_GET_LENGTH(to));
1727 return -1;
1728 }
1729
1730 if (how_many == 0)
1731 return 0;
1732
Victor Stinner488fa492011-12-12 00:01:39 +01001733 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001734 return -1;
1735
1736 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1737 if (err) {
1738 PyErr_Format(PyExc_SystemError,
1739 "Cannot copy %s characters "
1740 "into a string of %s characters",
1741 unicode_kind_name(from),
1742 unicode_kind_name(to));
1743 return -1;
1744 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001745 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746}
1747
Victor Stinner17222162011-09-28 22:15:37 +02001748/* Find the maximum code point and count the number of surrogate pairs so a
1749 correct string length can be computed before converting a string to UCS4.
1750 This function counts single surrogates as a character and not as a pair.
1751
1752 Return 0 on success, or -1 on error. */
1753static int
1754find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1755 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756{
1757 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001758 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759
Victor Stinnerc53be962011-10-02 21:33:54 +02001760 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 *num_surrogates = 0;
1762 *maxchar = 0;
1763
1764 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001766 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1767 && (iter+1) < end
1768 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1769 {
1770 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1771 ++(*num_surrogates);
1772 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 }
1774 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001776 {
1777 ch = *iter;
1778 iter++;
1779 }
1780 if (ch > *maxchar) {
1781 *maxchar = ch;
1782 if (*maxchar > MAX_UNICODE) {
1783 PyErr_Format(PyExc_ValueError,
1784 "character U+%x is not in range [U+0000; U+10ffff]",
1785 ch);
1786 return -1;
1787 }
1788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 }
1790 return 0;
1791}
1792
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001793int
1794_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795{
1796 wchar_t *end;
1797 Py_UCS4 maxchar = 0;
1798 Py_ssize_t num_surrogates;
1799#if SIZEOF_WCHAR_T == 2
1800 Py_ssize_t length_wo_surrogates;
1801#endif
1802
Georg Brandl7597add2011-10-05 16:36:47 +02001803 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001804 strings were created using _PyObject_New() and where no canonical
1805 representation (the str field) has been set yet aka strings
1806 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001807 assert(_PyUnicode_CHECK(unicode));
1808 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001810 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001811 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001812 /* Actually, it should neither be interned nor be anything else: */
1813 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001816 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001817 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819
1820 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001821 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1822 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 PyErr_NoMemory();
1824 return -1;
1825 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001826 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 _PyUnicode_WSTR(unicode), end,
1828 PyUnicode_1BYTE_DATA(unicode));
1829 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1830 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1831 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1832 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001833 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001834 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001835 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836 }
1837 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001838 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001839 _PyUnicode_UTF8(unicode) = NULL;
1840 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 }
1842 PyObject_FREE(_PyUnicode_WSTR(unicode));
1843 _PyUnicode_WSTR(unicode) = NULL;
1844 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1845 }
1846 /* In this case we might have to convert down from 4-byte native
1847 wchar_t to 2-byte unicode. */
1848 else if (maxchar < 65536) {
1849 assert(num_surrogates == 0 &&
1850 "FindMaxCharAndNumSurrogatePairs() messed up");
1851
Victor Stinner506f5922011-09-28 22:34:18 +02001852#if SIZEOF_WCHAR_T == 2
1853 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001854 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001855 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1856 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1857 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001858 _PyUnicode_UTF8(unicode) = NULL;
1859 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001860#else
1861 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001862 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001863 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001864 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001865 PyErr_NoMemory();
1866 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 }
Victor Stinner506f5922011-09-28 22:34:18 +02001868 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1869 _PyUnicode_WSTR(unicode), end,
1870 PyUnicode_2BYTE_DATA(unicode));
1871 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1872 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1873 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001874 _PyUnicode_UTF8(unicode) = NULL;
1875 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001876 PyObject_FREE(_PyUnicode_WSTR(unicode));
1877 _PyUnicode_WSTR(unicode) = NULL;
1878 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1879#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001880 }
1881 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1882 else {
1883#if SIZEOF_WCHAR_T == 2
1884 /* in case the native representation is 2-bytes, we need to allocate a
1885 new normalized 4-byte version. */
1886 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001887 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1888 PyErr_NoMemory();
1889 return -1;
1890 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001891 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1892 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001893 PyErr_NoMemory();
1894 return -1;
1895 }
1896 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1897 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001898 _PyUnicode_UTF8(unicode) = NULL;
1899 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001900 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1901 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001902 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 PyObject_FREE(_PyUnicode_WSTR(unicode));
1904 _PyUnicode_WSTR(unicode) = NULL;
1905 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1906#else
1907 assert(num_surrogates == 0);
1908
Victor Stinnerc3c74152011-10-02 20:39:55 +02001909 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001910 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001911 _PyUnicode_UTF8(unicode) = NULL;
1912 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1914#endif
1915 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1916 }
1917 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001918 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 return 0;
1920}
1921
Alexander Belopolsky40018472011-02-26 01:02:56 +00001922static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001923unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924{
Walter Dörwald16807132007-05-25 13:52:07 +00001925 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001926 case SSTATE_NOT_INTERNED:
1927 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001928
Benjamin Peterson29060642009-01-31 22:14:21 +00001929 case SSTATE_INTERNED_MORTAL:
1930 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001931 Py_SET_REFCNT(unicode, 3);
Victor Stinner607b1022020-05-05 18:50:30 +02001932#ifdef INTERNED_STRINGS
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001933 if (PyDict_DelItem(interned, unicode) != 0) {
1934 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1935 NULL);
1936 }
Victor Stinner607b1022020-05-05 18:50:30 +02001937#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001938 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001939
Benjamin Peterson29060642009-01-31 22:14:21 +00001940 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001941 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1942 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001943
Benjamin Peterson29060642009-01-31 22:14:21 +00001944 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001945 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001946 }
1947
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001948 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001950 }
1951 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001952 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001953 }
1954 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001955 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001958 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959}
1960
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001961#ifdef Py_DEBUG
1962static int
1963unicode_is_singleton(PyObject *unicode)
1964{
Victor Stinner607b1022020-05-05 18:50:30 +02001965 if (unicode == unicode_empty) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001966 return 1;
Victor Stinner607b1022020-05-05 18:50:30 +02001967 }
1968#ifdef LATIN1_SINGLETONS
1969 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001970 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1971 {
1972 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1973 if (ch < 256 && unicode_latin1[ch] == unicode)
1974 return 1;
1975 }
Victor Stinner607b1022020-05-05 18:50:30 +02001976#endif
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001977 return 0;
1978}
1979#endif
1980
Alexander Belopolsky40018472011-02-26 01:02:56 +00001981static int
Victor Stinner488fa492011-12-12 00:01:39 +01001982unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001983{
Victor Stinner488fa492011-12-12 00:01:39 +01001984 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001985 if (Py_REFCNT(unicode) != 1)
1986 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001987 if (_PyUnicode_HASH(unicode) != -1)
1988 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001989 if (PyUnicode_CHECK_INTERNED(unicode))
1990 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001991 if (!PyUnicode_CheckExact(unicode))
1992 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001993#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001994 /* singleton refcount is greater than 1 */
1995 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001996#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001997 return 1;
1998}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999
Victor Stinnerfe226c02011-10-03 03:52:20 +02002000static int
2001unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2002{
2003 PyObject *unicode;
2004 Py_ssize_t old_length;
2005
2006 assert(p_unicode != NULL);
2007 unicode = *p_unicode;
2008
2009 assert(unicode != NULL);
2010 assert(PyUnicode_Check(unicode));
2011 assert(0 <= length);
2012
Victor Stinner910337b2011-10-03 03:20:16 +02002013 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002014 old_length = PyUnicode_WSTR_LENGTH(unicode);
2015 else
2016 old_length = PyUnicode_GET_LENGTH(unicode);
2017 if (old_length == length)
2018 return 0;
2019
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002020 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002021 _Py_INCREF_UNICODE_EMPTY();
2022 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00002023 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002024 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002025 return 0;
2026 }
2027
Victor Stinner488fa492011-12-12 00:01:39 +01002028 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002029 PyObject *copy = resize_copy(unicode, length);
2030 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002032 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002033 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002034 }
2035
Victor Stinnerfe226c02011-10-03 03:52:20 +02002036 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002037 PyObject *new_unicode = resize_compact(unicode, length);
2038 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002039 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002040 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002041 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002042 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002043 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002044}
2045
Alexander Belopolsky40018472011-02-26 01:02:56 +00002046int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002047PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002048{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002049 PyObject *unicode;
2050 if (p_unicode == NULL) {
2051 PyErr_BadInternalCall();
2052 return -1;
2053 }
2054 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002055 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002056 {
2057 PyErr_BadInternalCall();
2058 return -1;
2059 }
2060 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002061}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002062
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002063/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002064
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002065 WARNING: The function doesn't copy the terminating null character and
2066 doesn't check the maximum character (may write a latin1 character in an
2067 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002068static void
2069unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2070 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002071{
2072 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002073 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002074 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002075
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002076 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002077 switch (kind) {
2078 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002079#ifdef Py_DEBUG
2080 if (PyUnicode_IS_ASCII(unicode)) {
2081 Py_UCS4 maxchar = ucs1lib_find_max_char(
2082 (const Py_UCS1*)str,
2083 (const Py_UCS1*)str + len);
2084 assert(maxchar < 128);
2085 }
2086#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002087 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002088 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002089 }
2090 case PyUnicode_2BYTE_KIND: {
2091 Py_UCS2 *start = (Py_UCS2 *)data + index;
2092 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002093
Victor Stinner184252a2012-06-16 02:57:41 +02002094 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002095 *ucs2 = (Py_UCS2)*str;
2096
2097 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002098 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002099 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002100 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002101 Py_UCS4 *start = (Py_UCS4 *)data + index;
2102 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002103
Victor Stinner184252a2012-06-16 02:57:41 +02002104 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002105 *ucs4 = (Py_UCS4)*str;
2106
2107 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002108 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002109 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002110 default:
2111 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002112 }
2113}
2114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002115static PyObject*
2116get_latin1_char(unsigned char ch)
2117{
Victor Stinner607b1022020-05-05 18:50:30 +02002118 PyObject *unicode;
2119
2120#ifdef LATIN1_SINGLETONS
2121 unicode = unicode_latin1[ch];
2122 if (unicode) {
2123 Py_INCREF(unicode);
2124 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 }
Victor Stinner607b1022020-05-05 18:50:30 +02002126#endif
2127
2128 unicode = PyUnicode_New(1, ch);
2129 if (!unicode) {
2130 return NULL;
2131 }
2132
2133 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2134 assert(_PyUnicode_CheckConsistency(unicode, 1));
2135
2136#ifdef LATIN1_SINGLETONS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137 Py_INCREF(unicode);
Victor Stinner607b1022020-05-05 18:50:30 +02002138 unicode_latin1[ch] = unicode;
2139#endif
Victor Stinnera464fc12011-10-02 20:39:30 +02002140 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002141}
2142
Victor Stinner985a82a2014-01-03 12:53:47 +01002143static PyObject*
2144unicode_char(Py_UCS4 ch)
2145{
2146 PyObject *unicode;
2147
2148 assert(ch <= MAX_UNICODE);
2149
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002150 if (ch < 256)
2151 return get_latin1_char(ch);
2152
Victor Stinner985a82a2014-01-03 12:53:47 +01002153 unicode = PyUnicode_New(1, ch);
2154 if (unicode == NULL)
2155 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002156
2157 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2158 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002159 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002160 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002161 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2162 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2163 }
2164 assert(_PyUnicode_CheckConsistency(unicode, 1));
2165 return unicode;
2166}
2167
Alexander Belopolsky40018472011-02-26 01:02:56 +00002168PyObject *
2169PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002171 if (u == NULL)
2172 return (PyObject*)_PyUnicode_New(size);
2173
2174 if (size < 0) {
2175 PyErr_BadInternalCall();
2176 return NULL;
2177 }
2178
2179 return PyUnicode_FromWideChar(u, size);
2180}
2181
2182PyObject *
2183PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2184{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002185 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 Py_UCS4 maxchar = 0;
2187 Py_ssize_t num_surrogates;
2188
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002189 if (u == NULL && size != 0) {
2190 PyErr_BadInternalCall();
2191 return NULL;
2192 }
2193
2194 if (size == -1) {
2195 size = wcslen(u);
2196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002198 /* If the Unicode data is known at construction time, we can apply
2199 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002202 if (size == 0)
2203 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 /* Single character Unicode objects in the Latin-1 range are
2206 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002207 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 return get_latin1_char((unsigned char)*u);
2209
2210 /* If not empty and not single character, copy the Unicode data
2211 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002212 if (find_maxchar_surrogates(u, u + size,
2213 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214 return NULL;
2215
Victor Stinner8faf8212011-12-08 22:14:11 +01002216 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 if (!unicode)
2218 return NULL;
2219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 switch (PyUnicode_KIND(unicode)) {
2221 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002222 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2224 break;
2225 case PyUnicode_2BYTE_KIND:
2226#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002227 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002229 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2231#endif
2232 break;
2233 case PyUnicode_4BYTE_KIND:
2234#if SIZEOF_WCHAR_T == 2
2235 /* This is the only case which has to process surrogates, thus
2236 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002237 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238#else
2239 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002240 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241#endif
2242 break;
2243 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002244 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002247 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248}
2249
Alexander Belopolsky40018472011-02-26 01:02:56 +00002250PyObject *
2251PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002252{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002253 if (size < 0) {
2254 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002255 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002256 return NULL;
2257 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002258 if (u != NULL)
2259 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2260 else
2261 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002262}
2263
Alexander Belopolsky40018472011-02-26 01:02:56 +00002264PyObject *
2265PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002266{
2267 size_t size = strlen(u);
2268 if (size > PY_SSIZE_T_MAX) {
2269 PyErr_SetString(PyExc_OverflowError, "input too long");
2270 return NULL;
2271 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002272 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002273}
2274
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002275PyObject *
2276_PyUnicode_FromId(_Py_Identifier *id)
2277{
2278 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002279 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2280 strlen(id->string),
2281 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002282 if (!id->object)
2283 return NULL;
2284 PyUnicode_InternInPlace(&id->object);
2285 assert(!id->next);
2286 id->next = static_strings;
2287 static_strings = id;
2288 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002289 return id->object;
2290}
2291
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002292static void
2293unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002294{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002295 _Py_Identifier *tmp, *s = static_strings;
2296 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002297 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002298 tmp = s->next;
2299 s->next = NULL;
2300 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002301 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002302 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002303}
2304
Benjamin Peterson0df54292012-03-26 14:50:32 -04002305/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002306
Victor Stinnerd3f08822012-05-29 12:57:52 +02002307PyObject*
2308_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002309{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002310 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002311 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002312 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002313#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002314 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002315#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002316 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002317 }
Victor Stinner785938e2011-12-11 20:09:03 +01002318 unicode = PyUnicode_New(size, 127);
2319 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002320 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002321 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2322 assert(_PyUnicode_CheckConsistency(unicode, 1));
2323 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002324}
2325
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002326static Py_UCS4
2327kind_maxchar_limit(unsigned int kind)
2328{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002329 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002330 case PyUnicode_1BYTE_KIND:
2331 return 0x80;
2332 case PyUnicode_2BYTE_KIND:
2333 return 0x100;
2334 case PyUnicode_4BYTE_KIND:
2335 return 0x10000;
2336 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002337 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002338 }
2339}
2340
Victor Stinner702c7342011-10-05 13:50:52 +02002341static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002342_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002343{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002344 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002345 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002346
Serhiy Storchaka678db842013-01-26 12:16:36 +02002347 if (size == 0)
2348 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002349 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002350 if (size == 1)
2351 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002352
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002353 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002354 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002355 if (!res)
2356 return NULL;
2357 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002358 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002360}
2361
Victor Stinnere57b1c02011-09-28 22:20:48 +02002362static PyObject*
2363_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364{
2365 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002366 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002367
Serhiy Storchaka678db842013-01-26 12:16:36 +02002368 if (size == 0)
2369 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002370 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002371 if (size == 1)
2372 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002373
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002374 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002375 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376 if (!res)
2377 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002378 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002380 else {
2381 _PyUnicode_CONVERT_BYTES(
2382 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2383 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002384 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 return res;
2386}
2387
Victor Stinnere57b1c02011-09-28 22:20:48 +02002388static PyObject*
2389_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390{
2391 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002392 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002393
Serhiy Storchaka678db842013-01-26 12:16:36 +02002394 if (size == 0)
2395 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002396 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002397 if (size == 1)
2398 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002399
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002400 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002401 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 if (!res)
2403 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002404 if (max_char < 256)
2405 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2406 PyUnicode_1BYTE_DATA(res));
2407 else if (max_char < 0x10000)
2408 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2409 PyUnicode_2BYTE_DATA(res));
2410 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002412 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 return res;
2414}
2415
2416PyObject*
2417PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2418{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002419 if (size < 0) {
2420 PyErr_SetString(PyExc_ValueError, "size must be positive");
2421 return NULL;
2422 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002423 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002425 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002427 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002429 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002430 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002431 PyErr_SetString(PyExc_SystemError, "invalid kind");
2432 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434}
2435
Victor Stinnerece58de2012-04-23 23:36:38 +02002436Py_UCS4
2437_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2438{
2439 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002440 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002441
2442 assert(PyUnicode_IS_READY(unicode));
2443 assert(0 <= start);
2444 assert(end <= PyUnicode_GET_LENGTH(unicode));
2445 assert(start <= end);
2446
2447 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2448 return PyUnicode_MAX_CHAR_VALUE(unicode);
2449
2450 if (start == end)
2451 return 127;
2452
Victor Stinner94d558b2012-04-27 22:26:58 +02002453 if (PyUnicode_IS_ASCII(unicode))
2454 return 127;
2455
Victor Stinnerece58de2012-04-23 23:36:38 +02002456 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002457 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002458 endptr = (char *)startptr + end * kind;
2459 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002460 switch(kind) {
2461 case PyUnicode_1BYTE_KIND:
2462 return ucs1lib_find_max_char(startptr, endptr);
2463 case PyUnicode_2BYTE_KIND:
2464 return ucs2lib_find_max_char(startptr, endptr);
2465 case PyUnicode_4BYTE_KIND:
2466 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002467 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002468 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002469 }
2470}
2471
Victor Stinner25a4b292011-10-06 12:31:55 +02002472/* Ensure that a string uses the most efficient storage, if it is not the
2473 case: create a new string with of the right kind. Write NULL into *p_unicode
2474 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002475static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002476unicode_adjust_maxchar(PyObject **p_unicode)
2477{
2478 PyObject *unicode, *copy;
2479 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002480 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002481 unsigned int kind;
2482
2483 assert(p_unicode != NULL);
2484 unicode = *p_unicode;
2485 assert(PyUnicode_IS_READY(unicode));
2486 if (PyUnicode_IS_ASCII(unicode))
2487 return;
2488
2489 len = PyUnicode_GET_LENGTH(unicode);
2490 kind = PyUnicode_KIND(unicode);
2491 if (kind == PyUnicode_1BYTE_KIND) {
2492 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002493 max_char = ucs1lib_find_max_char(u, u + len);
2494 if (max_char >= 128)
2495 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002496 }
2497 else if (kind == PyUnicode_2BYTE_KIND) {
2498 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002499 max_char = ucs2lib_find_max_char(u, u + len);
2500 if (max_char >= 256)
2501 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002502 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002503 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002504 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002505 max_char = ucs4lib_find_max_char(u, u + len);
2506 if (max_char >= 0x10000)
2507 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002508 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002509 else
2510 Py_UNREACHABLE();
2511
Victor Stinner25a4b292011-10-06 12:31:55 +02002512 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002513 if (copy != NULL)
2514 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002515 Py_DECREF(unicode);
2516 *p_unicode = copy;
2517}
2518
Victor Stinner034f6cf2011-09-30 02:26:44 +02002519PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002520_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002521{
Victor Stinner87af4f22011-11-21 23:03:47 +01002522 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002523 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002524
Victor Stinner034f6cf2011-09-30 02:26:44 +02002525 if (!PyUnicode_Check(unicode)) {
2526 PyErr_BadInternalCall();
2527 return NULL;
2528 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002529 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002530 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002531
Victor Stinner87af4f22011-11-21 23:03:47 +01002532 length = PyUnicode_GET_LENGTH(unicode);
2533 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002534 if (!copy)
2535 return NULL;
2536 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2537
Christian Heimesf051e432016-09-13 20:22:02 +02002538 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002539 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002540 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002541 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002542}
2543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002544
Victor Stinnerbc603d12011-10-02 01:00:40 +02002545/* Widen Unicode objects to larger buffers. Don't write terminating null
2546 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002547
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002548static void*
2549unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002550{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002551 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002552
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002553 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002554 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002555 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002556 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002557 if (!result)
2558 return PyErr_NoMemory();
2559 assert(skind == PyUnicode_1BYTE_KIND);
2560 _PyUnicode_CONVERT_BYTES(
2561 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002562 (const Py_UCS1 *)data,
2563 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002564 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002566 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002567 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002568 if (!result)
2569 return PyErr_NoMemory();
2570 if (skind == PyUnicode_2BYTE_KIND) {
2571 _PyUnicode_CONVERT_BYTES(
2572 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002573 (const Py_UCS2 *)data,
2574 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002575 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002577 else {
2578 assert(skind == PyUnicode_1BYTE_KIND);
2579 _PyUnicode_CONVERT_BYTES(
2580 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002581 (const Py_UCS1 *)data,
2582 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002583 result);
2584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002585 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002586 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002587 Py_UNREACHABLE();
2588 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590}
2591
2592static Py_UCS4*
2593as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2594 int copy_null)
2595{
2596 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002597 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 Py_ssize_t len, targetlen;
2599 if (PyUnicode_READY(string) == -1)
2600 return NULL;
2601 kind = PyUnicode_KIND(string);
2602 data = PyUnicode_DATA(string);
2603 len = PyUnicode_GET_LENGTH(string);
2604 targetlen = len;
2605 if (copy_null)
2606 targetlen++;
2607 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002608 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 if (!target) {
2610 PyErr_NoMemory();
2611 return NULL;
2612 }
2613 }
2614 else {
2615 if (targetsize < targetlen) {
2616 PyErr_Format(PyExc_SystemError,
2617 "string is longer than the buffer");
2618 if (copy_null && 0 < targetsize)
2619 target[0] = 0;
2620 return NULL;
2621 }
2622 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002623 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002624 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002625 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002627 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002628 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002629 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2630 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002631 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002632 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002633 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002634 else {
2635 Py_UNREACHABLE();
2636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 if (copy_null)
2638 target[len] = 0;
2639 return target;
2640}
2641
2642Py_UCS4*
2643PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2644 int copy_null)
2645{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002646 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002647 PyErr_BadInternalCall();
2648 return NULL;
2649 }
2650 return as_ucs4(string, target, targetsize, copy_null);
2651}
2652
2653Py_UCS4*
2654PyUnicode_AsUCS4Copy(PyObject *string)
2655{
2656 return as_ucs4(string, NULL, 0, 1);
2657}
2658
/* Size of the scratch buffers used to format a %lld / %llu or %p argument.

   A long long has at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits; 53/22 is an upper bound for log10(256), so
   1 + (SIZEOF_LONG_LONG*53 - 1) / 22 is that ceiling.  One more character is
   added, which covers either the sign of the largest-magnitude signed value
   or the NUL terminator after the largest unsigned value. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002663
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002664static int
2665unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2666 Py_ssize_t width, Py_ssize_t precision)
2667{
2668 Py_ssize_t length, fill, arglen;
2669 Py_UCS4 maxchar;
2670
2671 if (PyUnicode_READY(str) == -1)
2672 return -1;
2673
2674 length = PyUnicode_GET_LENGTH(str);
2675 if ((precision == -1 || precision >= length)
2676 && width <= length)
2677 return _PyUnicodeWriter_WriteStr(writer, str);
2678
2679 if (precision != -1)
2680 length = Py_MIN(precision, length);
2681
2682 arglen = Py_MAX(length, width);
2683 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2684 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2685 else
2686 maxchar = writer->maxchar;
2687
2688 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2689 return -1;
2690
2691 if (width > length) {
2692 fill = width - length;
2693 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2694 return -1;
2695 writer->pos += fill;
2696 }
2697
2698 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2699 str, 0, length);
2700 writer->pos += length;
2701 return 0;
2702}
2703
2704static int
Victor Stinner998b8062018-09-12 00:23:25 +02002705unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002706 Py_ssize_t width, Py_ssize_t precision)
2707{
2708 /* UTF-8 */
2709 Py_ssize_t length;
2710 PyObject *unicode;
2711 int res;
2712
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002713 if (precision == -1) {
2714 length = strlen(str);
2715 }
2716 else {
2717 length = 0;
2718 while (length < precision && str[length]) {
2719 length++;
2720 }
2721 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002722 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2723 if (unicode == NULL)
2724 return -1;
2725
2726 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2727 Py_DECREF(unicode);
2728 return res;
2729}
2730
Victor Stinner96865452011-03-01 23:44:09 +00002731static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002732unicode_fromformat_arg(_PyUnicodeWriter *writer,
2733 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002734{
Victor Stinnere215d962012-10-06 23:03:36 +02002735 const char *p;
2736 Py_ssize_t len;
2737 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002738 Py_ssize_t width;
2739 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002740 int longflag;
2741 int longlongflag;
2742 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002743 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002744
2745 p = f;
2746 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002747 zeropad = 0;
2748 if (*f == '0') {
2749 zeropad = 1;
2750 f++;
2751 }
Victor Stinner96865452011-03-01 23:44:09 +00002752
2753 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002754 width = -1;
2755 if (Py_ISDIGIT((unsigned)*f)) {
2756 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002757 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002758 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002759 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002760 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002761 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002762 return NULL;
2763 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002764 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002765 f++;
2766 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002767 }
2768 precision = -1;
2769 if (*f == '.') {
2770 f++;
2771 if (Py_ISDIGIT((unsigned)*f)) {
2772 precision = (*f - '0');
2773 f++;
2774 while (Py_ISDIGIT((unsigned)*f)) {
2775 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2776 PyErr_SetString(PyExc_ValueError,
2777 "precision too big");
2778 return NULL;
2779 }
2780 precision = (precision * 10) + (*f - '0');
2781 f++;
2782 }
2783 }
Victor Stinner96865452011-03-01 23:44:09 +00002784 if (*f == '%') {
2785 /* "%.3%s" => f points to "3" */
2786 f--;
2787 }
2788 }
2789 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002790 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002791 f--;
2792 }
Victor Stinner96865452011-03-01 23:44:09 +00002793
2794 /* Handle %ld, %lu, %lld and %llu. */
2795 longflag = 0;
2796 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002797 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002798 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002799 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002800 longflag = 1;
2801 ++f;
2802 }
Victor Stinner96865452011-03-01 23:44:09 +00002803 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002804 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002805 longlongflag = 1;
2806 f += 2;
2807 }
Victor Stinner96865452011-03-01 23:44:09 +00002808 }
2809 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002811 size_tflag = 1;
2812 ++f;
2813 }
Victor Stinnere215d962012-10-06 23:03:36 +02002814
2815 if (f[1] == '\0')
2816 writer->overallocate = 0;
2817
2818 switch (*f) {
2819 case 'c':
2820 {
2821 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002822 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002823 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002824 "character argument not in range(0x110000)");
2825 return NULL;
2826 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002827 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002828 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002829 break;
2830 }
2831
2832 case 'i':
2833 case 'd':
2834 case 'u':
2835 case 'x':
2836 {
2837 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002838 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002839 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002840
2841 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002842 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002843 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002844 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002845 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002846 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002847 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002848 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002849 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002850 va_arg(*vargs, size_t));
2851 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002852 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002853 va_arg(*vargs, unsigned int));
2854 }
2855 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002856 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002857 }
2858 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002859 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002860 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002861 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002862 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002863 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002864 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002865 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002866 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002867 va_arg(*vargs, Py_ssize_t));
2868 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002869 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002870 va_arg(*vargs, int));
2871 }
2872 assert(len >= 0);
2873
Victor Stinnere215d962012-10-06 23:03:36 +02002874 if (precision < len)
2875 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002876
2877 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002878 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2879 return NULL;
2880
Victor Stinnere215d962012-10-06 23:03:36 +02002881 if (width > precision) {
2882 Py_UCS4 fillchar;
2883 fill = width - precision;
2884 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002885 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2886 return NULL;
2887 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002888 }
Victor Stinner15a11362012-10-06 23:48:20 +02002889 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002890 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002891 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2892 return NULL;
2893 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002894 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002895
Victor Stinner4a587072013-11-19 12:54:53 +01002896 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2897 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002898 break;
2899 }
2900
2901 case 'p':
2902 {
2903 char number[MAX_LONG_LONG_CHARS];
2904
2905 len = sprintf(number, "%p", va_arg(*vargs, void*));
2906 assert(len >= 0);
2907
2908 /* %p is ill-defined: ensure leading 0x. */
2909 if (number[1] == 'X')
2910 number[1] = 'x';
2911 else if (number[1] != 'x') {
2912 memmove(number + 2, number,
2913 strlen(number) + 1);
2914 number[0] = '0';
2915 number[1] = 'x';
2916 len += 2;
2917 }
2918
Victor Stinner4a587072013-11-19 12:54:53 +01002919 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002920 return NULL;
2921 break;
2922 }
2923
2924 case 's':
2925 {
2926 /* UTF-8 */
2927 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002928 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002929 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002930 break;
2931 }
2932
2933 case 'U':
2934 {
2935 PyObject *obj = va_arg(*vargs, PyObject *);
2936 assert(obj && _PyUnicode_CHECK(obj));
2937
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002938 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002939 return NULL;
2940 break;
2941 }
2942
2943 case 'V':
2944 {
2945 PyObject *obj = va_arg(*vargs, PyObject *);
2946 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002947 if (obj) {
2948 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002949 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002950 return NULL;
2951 }
2952 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002953 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002954 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002955 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002956 }
2957 break;
2958 }
2959
2960 case 'S':
2961 {
2962 PyObject *obj = va_arg(*vargs, PyObject *);
2963 PyObject *str;
2964 assert(obj);
2965 str = PyObject_Str(obj);
2966 if (!str)
2967 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002968 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002969 Py_DECREF(str);
2970 return NULL;
2971 }
2972 Py_DECREF(str);
2973 break;
2974 }
2975
2976 case 'R':
2977 {
2978 PyObject *obj = va_arg(*vargs, PyObject *);
2979 PyObject *repr;
2980 assert(obj);
2981 repr = PyObject_Repr(obj);
2982 if (!repr)
2983 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002984 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002985 Py_DECREF(repr);
2986 return NULL;
2987 }
2988 Py_DECREF(repr);
2989 break;
2990 }
2991
2992 case 'A':
2993 {
2994 PyObject *obj = va_arg(*vargs, PyObject *);
2995 PyObject *ascii;
2996 assert(obj);
2997 ascii = PyObject_ASCII(obj);
2998 if (!ascii)
2999 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003000 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003001 Py_DECREF(ascii);
3002 return NULL;
3003 }
3004 Py_DECREF(ascii);
3005 break;
3006 }
3007
3008 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003009 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003010 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003011 break;
3012
3013 default:
3014 /* if we stumble upon an unknown formatting code, copy the rest
3015 of the format string to the output string. (we cannot just
3016 skip the code, since there's no way to know what's in the
3017 argument list) */
3018 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003019 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003020 return NULL;
3021 f = p+len;
3022 return f;
3023 }
3024
3025 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003026 return f;
3027}
3028
Walter Dörwaldd2034312007-05-18 16:29:38 +00003029PyObject *
3030PyUnicode_FromFormatV(const char *format, va_list vargs)
3031{
Victor Stinnere215d962012-10-06 23:03:36 +02003032 va_list vargs2;
3033 const char *f;
3034 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003035
Victor Stinner8f674cc2013-04-17 23:02:17 +02003036 _PyUnicodeWriter_Init(&writer);
3037 writer.min_length = strlen(format) + 100;
3038 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003039
Benjamin Peterson0c212142016-09-20 20:39:33 -07003040 // Copy varags to be able to pass a reference to a subfunction.
3041 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003042
3043 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003044 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003045 f = unicode_fromformat_arg(&writer, f, &vargs2);
3046 if (f == NULL)
3047 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003048 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003049 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003050 const char *p;
3051 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003052
Victor Stinnere215d962012-10-06 23:03:36 +02003053 p = f;
3054 do
3055 {
3056 if ((unsigned char)*p > 127) {
3057 PyErr_Format(PyExc_ValueError,
3058 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3059 "string, got a non-ASCII byte: 0x%02x",
3060 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003061 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003062 }
3063 p++;
3064 }
3065 while (*p != '\0' && *p != '%');
3066 len = p - f;
3067
3068 if (*p == '\0')
3069 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003070
3071 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003072 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003073
3074 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003075 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003076 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003077 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003078 return _PyUnicodeWriter_Finish(&writer);
3079
3080 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003081 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003082 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003083 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003084}
3085
Walter Dörwaldd2034312007-05-18 16:29:38 +00003086PyObject *
3087PyUnicode_FromFormat(const char *format, ...)
3088{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003089 PyObject* ret;
3090 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003091
3092#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003093 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003094#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003095 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003096#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003097 ret = PyUnicode_FromFormatV(format, vargs);
3098 va_end(vargs);
3099 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003100}
3101
Serhiy Storchakac46db922018-10-23 22:58:24 +03003102static Py_ssize_t
3103unicode_get_widechar_size(PyObject *unicode)
3104{
3105 Py_ssize_t res;
3106
3107 assert(unicode != NULL);
3108 assert(_PyUnicode_CHECK(unicode));
3109
3110 if (_PyUnicode_WSTR(unicode) != NULL) {
3111 return PyUnicode_WSTR_LENGTH(unicode);
3112 }
3113 assert(PyUnicode_IS_READY(unicode));
3114
3115 res = _PyUnicode_LENGTH(unicode);
3116#if SIZEOF_WCHAR_T == 2
3117 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3118 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3119 const Py_UCS4 *end = s + res;
3120 for (; s < end; ++s) {
3121 if (*s > 0xFFFF) {
3122 ++res;
3123 }
3124 }
3125 }
3126#endif
3127 return res;
3128}
3129
3130static void
3131unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3132{
3133 const wchar_t *wstr;
3134
3135 assert(unicode != NULL);
3136 assert(_PyUnicode_CHECK(unicode));
3137
3138 wstr = _PyUnicode_WSTR(unicode);
3139 if (wstr != NULL) {
3140 memcpy(w, wstr, size * sizeof(wchar_t));
3141 return;
3142 }
3143 assert(PyUnicode_IS_READY(unicode));
3144
3145 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3146 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3147 for (; size--; ++s, ++w) {
3148 *w = *s;
3149 }
3150 }
3151 else {
3152#if SIZEOF_WCHAR_T == 4
3153 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3154 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3155 for (; size--; ++s, ++w) {
3156 *w = *s;
3157 }
3158#else
3159 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3160 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3161 for (; size--; ++s, ++w) {
3162 Py_UCS4 ch = *s;
3163 if (ch > 0xFFFF) {
3164 assert(ch <= MAX_UNICODE);
3165 /* encode surrogate pair in this case */
3166 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3167 if (!size--)
3168 break;
3169 *w = Py_UNICODE_LOW_SURROGATE(ch);
3170 }
3171 else {
3172 *w = ch;
3173 }
3174 }
3175#endif
3176 }
3177}
3178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003179#ifdef HAVE_WCHAR_H
3180
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003181/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003182
Victor Stinnerd88d9832011-09-06 02:00:05 +02003183 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003184 character) required to convert the unicode object. Ignore size argument.
3185
Victor Stinnerd88d9832011-09-06 02:00:05 +02003186 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003187 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003188 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003189Py_ssize_t
3190PyUnicode_AsWideChar(PyObject *unicode,
3191 wchar_t *w,
3192 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003193{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003194 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003195
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003196 if (unicode == NULL) {
3197 PyErr_BadInternalCall();
3198 return -1;
3199 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003200 if (!PyUnicode_Check(unicode)) {
3201 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003202 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003203 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003204
3205 res = unicode_get_widechar_size(unicode);
3206 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003207 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003208 }
3209
3210 if (size > res) {
3211 size = res + 1;
3212 }
3213 else {
3214 res = size;
3215 }
3216 unicode_copy_as_widechar(unicode, w, size);
3217 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003218}
3219
Victor Stinner137c34c2010-09-29 10:25:54 +00003220wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003221PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003222 Py_ssize_t *size)
3223{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003224 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003225 Py_ssize_t buflen;
3226
3227 if (unicode == NULL) {
3228 PyErr_BadInternalCall();
3229 return NULL;
3230 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003231 if (!PyUnicode_Check(unicode)) {
3232 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003233 return NULL;
3234 }
3235
Serhiy Storchakac46db922018-10-23 22:58:24 +03003236 buflen = unicode_get_widechar_size(unicode);
3237 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003238 if (buffer == NULL) {
3239 PyErr_NoMemory();
3240 return NULL;
3241 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003242 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3243 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003244 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003245 }
3246 else if (wcslen(buffer) != (size_t)buflen) {
3247 PyMem_FREE(buffer);
3248 PyErr_SetString(PyExc_ValueError,
3249 "embedded null character");
3250 return NULL;
3251 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003252 return buffer;
3253}
3254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003255#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256
Alexander Belopolsky40018472011-02-26 01:02:56 +00003257PyObject *
3258PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003259{
Victor Stinner8faf8212011-12-08 22:14:11 +01003260 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 PyErr_SetString(PyExc_ValueError,
3262 "chr() arg not in range(0x110000)");
3263 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003264 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003265
Victor Stinner985a82a2014-01-03 12:53:47 +01003266 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003267}
3268
Alexander Belopolsky40018472011-02-26 01:02:56 +00003269PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003270PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003272 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003273 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003274 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003275 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003276 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003277 Py_INCREF(obj);
3278 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003279 }
3280 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 /* For a Unicode subtype that's not a Unicode object,
3282 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003283 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003284 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003285 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003286 "Can't convert '%.100s' object to str implicitly",
3287 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003288 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003289}
3290
Alexander Belopolsky40018472011-02-26 01:02:56 +00003291PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003292PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003293 const char *encoding,
3294 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003295{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003296 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003297 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003298
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003300 PyErr_BadInternalCall();
3301 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003303
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003304 /* Decoding bytes objects is the most common case and should be fast */
3305 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003306 if (PyBytes_GET_SIZE(obj) == 0) {
3307 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3308 return NULL;
3309 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003310 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003311 }
3312 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003313 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3314 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003315 }
3316
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003317 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003318 PyErr_SetString(PyExc_TypeError,
3319 "decoding str is not supported");
3320 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003321 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003322
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003323 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3324 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3325 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003326 "decoding to str: need a bytes-like object, %.80s found",
3327 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003328 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003329 }
Tim Petersced69f82003-09-16 20:30:58 +00003330
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003331 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003332 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003333 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3334 return NULL;
3335 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003336 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003338
Serhiy Storchaka05997252013-01-26 12:14:02 +02003339 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003340 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003341 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342}
3343
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Every
   run of other characters between two copied characters becomes a single
   '_'; such runs at the start or end of the name are dropped.  The result is
   null-terminated.

   Return 1 on success, or 0 on error (the normalized name is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *out = lower;
    /* Last usable slot: reserved for the terminating null. */
    char *out_end = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3392
Alexander Belopolsky40018472011-02-26 01:02:56 +00003393PyObject *
3394PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003395 Py_ssize_t size,
3396 const char *encoding,
3397 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003398{
3399 PyObject *buffer = NULL, *unicode;
3400 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003401 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3402
Victor Stinner22eb6892019-06-26 00:51:05 +02003403 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3404 return NULL;
3405 }
3406
Victor Stinnered076ed2019-06-26 01:49:32 +02003407 if (size == 0) {
3408 _Py_RETURN_UNICODE_EMPTY();
3409 }
3410
Victor Stinner942889a2016-09-05 15:40:10 -07003411 if (encoding == NULL) {
3412 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3413 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003414
Fred Drakee4315f52000-05-09 19:53:39 +00003415 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003416 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3417 char *lower = buflower;
3418
3419 /* Fast paths */
3420 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3421 lower += 3;
3422 if (*lower == '_') {
3423 /* Match "utf8" and "utf_8" */
3424 lower++;
3425 }
3426
3427 if (lower[0] == '8' && lower[1] == 0) {
3428 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3429 }
3430 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3431 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3432 }
3433 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3434 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3435 }
3436 }
3437 else {
3438 if (strcmp(lower, "ascii") == 0
3439 || strcmp(lower, "us_ascii") == 0) {
3440 return PyUnicode_DecodeASCII(s, size, errors);
3441 }
Steve Dowercc16be82016-09-08 10:35:16 -07003442 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003443 else if (strcmp(lower, "mbcs") == 0) {
3444 return PyUnicode_DecodeMBCS(s, size, errors);
3445 }
3446 #endif
3447 else if (strcmp(lower, "latin1") == 0
3448 || strcmp(lower, "latin_1") == 0
3449 || strcmp(lower, "iso_8859_1") == 0
3450 || strcmp(lower, "iso8859_1") == 0) {
3451 return PyUnicode_DecodeLatin1(s, size, errors);
3452 }
3453 }
Victor Stinner37296e82010-06-10 13:36:23 +00003454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455
3456 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003457 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003458 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003459 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003460 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461 if (buffer == NULL)
3462 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003463 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 if (unicode == NULL)
3465 goto onError;
3466 if (!PyUnicode_Check(unicode)) {
3467 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003468 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003469 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003470 encoding,
3471 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 Py_DECREF(unicode);
3473 goto onError;
3474 }
3475 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003476 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003477
Benjamin Peterson29060642009-01-31 22:14:21 +00003478 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479 Py_XDECREF(buffer);
3480 return NULL;
3481}
3482
Alexander Belopolsky40018472011-02-26 01:02:56 +00003483PyObject *
3484PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003485 const char *encoding,
3486 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003487{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003488 if (!PyUnicode_Check(unicode)) {
3489 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003490 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003491 }
3492
Serhiy Storchaka00939072016-10-27 21:05:49 +03003493 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3494 "PyUnicode_AsDecodedObject() is deprecated; "
3495 "use PyCodec_Decode() to decode from str", 1) < 0)
3496 return NULL;
3497
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003498 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003499 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003500
3501 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003502 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003503}
3504
Alexander Belopolsky40018472011-02-26 01:02:56 +00003505PyObject *
3506PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003507 const char *encoding,
3508 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003509{
3510 PyObject *v;
3511
3512 if (!PyUnicode_Check(unicode)) {
3513 PyErr_BadArgument();
3514 goto onError;
3515 }
3516
Serhiy Storchaka00939072016-10-27 21:05:49 +03003517 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3518 "PyUnicode_AsDecodedUnicode() is deprecated; "
3519 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3520 return NULL;
3521
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003522 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003523 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003524
3525 /* Decode via the codec registry */
3526 v = PyCodec_Decode(unicode, encoding, errors);
3527 if (v == NULL)
3528 goto onError;
3529 if (!PyUnicode_Check(v)) {
3530 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003531 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003532 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003533 encoding,
3534 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003535 Py_DECREF(v);
3536 goto onError;
3537 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003538 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003539
Benjamin Peterson29060642009-01-31 22:14:21 +00003540 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003541 return NULL;
3542}
3543
Alexander Belopolsky40018472011-02-26 01:02:56 +00003544PyObject *
3545PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003546 Py_ssize_t size,
3547 const char *encoding,
3548 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549{
3550 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003551
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003552 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3556 Py_DECREF(unicode);
3557 return v;
3558}
3559
Alexander Belopolsky40018472011-02-26 01:02:56 +00003560PyObject *
3561PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003562 const char *encoding,
3563 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003564{
3565 PyObject *v;
3566
3567 if (!PyUnicode_Check(unicode)) {
3568 PyErr_BadArgument();
3569 goto onError;
3570 }
3571
Serhiy Storchaka00939072016-10-27 21:05:49 +03003572 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3573 "PyUnicode_AsEncodedObject() is deprecated; "
3574 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3575 "or PyCodec_Encode() for generic encoding", 1) < 0)
3576 return NULL;
3577
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003578 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003579 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003580
3581 /* Encode via the codec registry */
3582 v = PyCodec_Encode(unicode, encoding, errors);
3583 if (v == NULL)
3584 goto onError;
3585 return v;
3586
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003588 return NULL;
3589}
3590
Victor Stinner1b579672011-12-17 05:47:23 +01003591
Victor Stinner2cba6b82018-01-10 22:46:15 +01003592static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003593unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003594 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003595{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003596 Py_ssize_t wlen;
3597 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3598 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003599 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003600 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003601
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003602 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003603 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003604 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003605 return NULL;
3606 }
3607
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003608 char *str;
3609 size_t error_pos;
3610 const char *reason;
3611 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003612 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003613 PyMem_Free(wstr);
3614
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003615 if (res != 0) {
3616 if (res == -2) {
3617 PyObject *exc;
3618 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3619 "locale", unicode,
3620 (Py_ssize_t)error_pos,
3621 (Py_ssize_t)(error_pos+1),
3622 reason);
3623 if (exc != NULL) {
3624 PyCodec_StrictErrors(exc);
3625 Py_DECREF(exc);
3626 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003627 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003628 else if (res == -3) {
3629 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3630 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003631 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003632 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003633 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003634 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003635 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003636
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003637 PyObject *bytes = PyBytes_FromString(str);
3638 PyMem_RawFree(str);
3639 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003640}
3641
Victor Stinnerad158722010-10-27 00:25:46 +00003642PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003643PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3644{
Victor Stinner709d23d2019-05-02 14:56:30 -04003645 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3646 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003647}
3648
3649PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003650PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003651{
Victor Stinner81a7be32020-04-14 15:14:01 +02003652 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003653 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3654 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003655 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003656 fs_codec->error_handler,
3657 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003658 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003659#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003660 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003661 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003662 fs_codec->encoding,
3663 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003664 }
Victor Stinnerad158722010-10-27 00:25:46 +00003665#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003666 else {
3667 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3668 machinery is not ready and so cannot be used:
3669 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003670 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3671 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003672 assert(filesystem_errors != NULL);
3673 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3674 assert(errors != _Py_ERROR_UNKNOWN);
3675#ifdef _Py_FORCE_UTF8_FS_ENCODING
3676 return unicode_encode_utf8(unicode, errors, NULL);
3677#else
3678 return unicode_encode_locale(unicode, errors, 0);
3679#endif
3680 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003681}
3682
Alexander Belopolsky40018472011-02-26 01:02:56 +00003683PyObject *
3684PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003685 const char *encoding,
3686 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687{
3688 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003689 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003690
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 if (!PyUnicode_Check(unicode)) {
3692 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003693 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 }
Fred Drakee4315f52000-05-09 19:53:39 +00003695
Victor Stinner22eb6892019-06-26 00:51:05 +02003696 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3697 return NULL;
3698 }
3699
Victor Stinner942889a2016-09-05 15:40:10 -07003700 if (encoding == NULL) {
3701 return _PyUnicode_AsUTF8String(unicode, errors);
3702 }
3703
Fred Drakee4315f52000-05-09 19:53:39 +00003704 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003705 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3706 char *lower = buflower;
3707
3708 /* Fast paths */
3709 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3710 lower += 3;
3711 if (*lower == '_') {
3712 /* Match "utf8" and "utf_8" */
3713 lower++;
3714 }
3715
3716 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003718 }
3719 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3720 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3721 }
3722 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3723 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3724 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003725 }
Victor Stinner942889a2016-09-05 15:40:10 -07003726 else {
3727 if (strcmp(lower, "ascii") == 0
3728 || strcmp(lower, "us_ascii") == 0) {
3729 return _PyUnicode_AsASCIIString(unicode, errors);
3730 }
Steve Dowercc16be82016-09-08 10:35:16 -07003731#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003732 else if (strcmp(lower, "mbcs") == 0) {
3733 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3734 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003735#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003736 else if (strcmp(lower, "latin1") == 0 ||
3737 strcmp(lower, "latin_1") == 0 ||
3738 strcmp(lower, "iso_8859_1") == 0 ||
3739 strcmp(lower, "iso8859_1") == 0) {
3740 return _PyUnicode_AsLatin1String(unicode, errors);
3741 }
3742 }
Victor Stinner37296e82010-06-10 13:36:23 +00003743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744
3745 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003746 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003748 return NULL;
3749
3750 /* The normal path */
3751 if (PyBytes_Check(v))
3752 return v;
3753
3754 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003755 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003756 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003757 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003758
3759 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003760 "encoder %s returned bytearray instead of bytes; "
3761 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003762 encoding);
3763 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003764 Py_DECREF(v);
3765 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003766 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003767
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003768 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3769 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003770 Py_DECREF(v);
3771 return b;
3772 }
3773
3774 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003775 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003776 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003777 encoding,
3778 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003779 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003780 return NULL;
3781}
3782
Alexander Belopolsky40018472011-02-26 01:02:56 +00003783PyObject *
3784PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003785 const char *encoding,
3786 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003787{
3788 PyObject *v;
3789
3790 if (!PyUnicode_Check(unicode)) {
3791 PyErr_BadArgument();
3792 goto onError;
3793 }
3794
Serhiy Storchaka00939072016-10-27 21:05:49 +03003795 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3796 "PyUnicode_AsEncodedUnicode() is deprecated; "
3797 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3798 return NULL;
3799
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003800 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003801 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003802
3803 /* Encode via the codec registry */
3804 v = PyCodec_Encode(unicode, encoding, errors);
3805 if (v == NULL)
3806 goto onError;
3807 if (!PyUnicode_Check(v)) {
3808 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003809 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003810 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003811 encoding,
3812 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003813 Py_DECREF(v);
3814 goto onError;
3815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003817
Benjamin Peterson29060642009-01-31 22:14:21 +00003818 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819 return NULL;
3820}
3821
Victor Stinner2cba6b82018-01-10 22:46:15 +01003822static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003823unicode_decode_locale(const char *str, Py_ssize_t len,
3824 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003825{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003826 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3827 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003828 return NULL;
3829 }
3830
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003831 wchar_t *wstr;
3832 size_t wlen;
3833 const char *reason;
3834 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003835 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003836 if (res != 0) {
3837 if (res == -2) {
3838 PyObject *exc;
3839 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3840 "locale", str, len,
3841 (Py_ssize_t)wlen,
3842 (Py_ssize_t)(wlen + 1),
3843 reason);
3844 if (exc != NULL) {
3845 PyCodec_StrictErrors(exc);
3846 Py_DECREF(exc);
3847 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003848 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003849 else if (res == -3) {
3850 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3851 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003852 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003853 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003854 }
Victor Stinner2f197072011-12-17 07:08:30 +01003855 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003856 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003857
3858 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3859 PyMem_RawFree(wstr);
3860 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003861}
3862
3863PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003864PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3865 const char *errors)
3866{
Victor Stinner709d23d2019-05-02 14:56:30 -04003867 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3868 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003869}
3870
3871PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003872PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003873{
3874 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003875 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3876 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003877}
3878
3879
3880PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003881PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003882 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003883 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3884}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003885
Christian Heimes5894ba72007-11-04 11:43:14 +00003886PyObject*
3887PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3888{
Victor Stinner81a7be32020-04-14 15:14:01 +02003889 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003890 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3891 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003892 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003893 fs_codec->error_handler,
3894 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04003895 NULL);
3896 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003897#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003898 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003899 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003900 fs_codec->encoding,
3901 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003902 }
Victor Stinnerad158722010-10-27 00:25:46 +00003903#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003904 else {
3905 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3906 machinery is not ready and so cannot be used:
3907 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003908 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3909 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003910 assert(filesystem_errors != NULL);
3911 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3912 assert(errors != _Py_ERROR_UNKNOWN);
3913#ifdef _Py_FORCE_UTF8_FS_ENCODING
3914 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3915#else
3916 return unicode_decode_locale(s, size, errors, 0);
3917#endif
3918 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003919}
3920
Martin v. Löwis011e8422009-05-05 04:43:17 +00003921
3922int
3923PyUnicode_FSConverter(PyObject* arg, void* addr)
3924{
Brett Cannonec6ce872016-09-06 15:50:29 -07003925 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003926 PyObject *output = NULL;
3927 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03003928 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003929 if (arg == NULL) {
3930 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003931 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003932 return 1;
3933 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003934 path = PyOS_FSPath(arg);
3935 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003936 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003937 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003938 if (PyBytes_Check(path)) {
3939 output = path;
3940 }
3941 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3942 output = PyUnicode_EncodeFSDefault(path);
3943 Py_DECREF(path);
3944 if (!output) {
3945 return 0;
3946 }
3947 assert(PyBytes_Check(output));
3948 }
3949
Victor Stinner0ea2a462010-04-30 00:22:08 +00003950 size = PyBytes_GET_SIZE(output);
3951 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003952 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003953 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003954 Py_DECREF(output);
3955 return 0;
3956 }
3957 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003958 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003959}
3960
3961
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003962int
3963PyUnicode_FSDecoder(PyObject* arg, void* addr)
3964{
Brett Cannona5711202016-09-06 19:36:01 -07003965 int is_buffer = 0;
3966 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003967 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003968 if (arg == NULL) {
3969 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003970 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003971 return 1;
3972 }
Brett Cannona5711202016-09-06 19:36:01 -07003973
3974 is_buffer = PyObject_CheckBuffer(arg);
3975 if (!is_buffer) {
3976 path = PyOS_FSPath(arg);
3977 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003978 return 0;
3979 }
Brett Cannona5711202016-09-06 19:36:01 -07003980 }
3981 else {
3982 path = arg;
3983 Py_INCREF(arg);
3984 }
3985
3986 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003987 output = path;
3988 }
3989 else if (PyBytes_Check(path) || is_buffer) {
3990 PyObject *path_bytes = NULL;
3991
3992 if (!PyBytes_Check(path) &&
3993 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003994 "path should be string, bytes, or os.PathLike, not %.200s",
3995 Py_TYPE(arg)->tp_name)) {
3996 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003997 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003998 }
3999 path_bytes = PyBytes_FromObject(path);
4000 Py_DECREF(path);
4001 if (!path_bytes) {
4002 return 0;
4003 }
4004 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4005 PyBytes_GET_SIZE(path_bytes));
4006 Py_DECREF(path_bytes);
4007 if (!output) {
4008 return 0;
4009 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004010 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004011 else {
4012 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004013 "path should be string, bytes, or os.PathLike, not %.200s",
4014 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004015 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004016 return 0;
4017 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004018 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004019 Py_DECREF(output);
4020 return 0;
4021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004022 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004023 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004024 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004025 Py_DECREF(output);
4026 return 0;
4027 }
4028 *(PyObject**)addr = output;
4029 return Py_CLEANUP_SUPPORTED;
4030}
4031
4032
Inada Naoki02a4d572020-02-27 13:48:59 +09004033static int unicode_fill_utf8(PyObject *unicode);
4034
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004035const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004037{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004038 if (!PyUnicode_Check(unicode)) {
4039 PyErr_BadArgument();
4040 return NULL;
4041 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004042 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004043 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004044
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004045 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004046 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047 return NULL;
4048 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049 }
4050
4051 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004052 *psize = PyUnicode_UTF8_LENGTH(unicode);
4053 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004054}
4055
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004056const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004058{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4060}
4061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062Py_UNICODE *
4063PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4064{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065 if (!PyUnicode_Check(unicode)) {
4066 PyErr_BadArgument();
4067 return NULL;
4068 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004069 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4070 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004071 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004072 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004073 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004074
Serhiy Storchakac46db922018-10-23 22:58:24 +03004075 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4076 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4077 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004079 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004080 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4081 if (w == NULL) {
4082 PyErr_NoMemory();
4083 return NULL;
4084 }
4085 unicode_copy_as_widechar(unicode, w, wlen + 1);
4086 _PyUnicode_WSTR(unicode) = w;
4087 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4088 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089 }
4090 }
4091 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004092 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004093 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004094}
4095
Alexander Belopolsky40018472011-02-26 01:02:56 +00004096Py_UNICODE *
4097PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100}
4101
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004102const Py_UNICODE *
4103_PyUnicode_AsUnicode(PyObject *unicode)
4104{
4105 Py_ssize_t size;
4106 const Py_UNICODE *wstr;
4107
4108 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4109 if (wstr && wcslen(wstr) != (size_t)size) {
4110 PyErr_SetString(PyExc_ValueError, "embedded null character");
4111 return NULL;
4112 }
4113 return wstr;
4114}
4115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116
Alexander Belopolsky40018472011-02-26 01:02:56 +00004117Py_ssize_t
4118PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119{
4120 if (!PyUnicode_Check(unicode)) {
4121 PyErr_BadArgument();
4122 goto onError;
4123 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004124 if (_PyUnicode_WSTR(unicode) == NULL) {
4125 if (PyUnicode_AsUnicode(unicode) == NULL)
4126 goto onError;
4127 }
4128 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 return -1;
4132}
4133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004134Py_ssize_t
4135PyUnicode_GetLength(PyObject *unicode)
4136{
Victor Stinner07621332012-06-16 04:53:46 +02004137 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004138 PyErr_BadArgument();
4139 return -1;
4140 }
Victor Stinner07621332012-06-16 04:53:46 +02004141 if (PyUnicode_READY(unicode) == -1)
4142 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004143 return PyUnicode_GET_LENGTH(unicode);
4144}
4145
4146Py_UCS4
4147PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4148{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004149 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004150 int kind;
4151
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004152 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004153 PyErr_BadArgument();
4154 return (Py_UCS4)-1;
4155 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004156 if (PyUnicode_READY(unicode) == -1) {
4157 return (Py_UCS4)-1;
4158 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004159 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004160 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004161 return (Py_UCS4)-1;
4162 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004163 data = PyUnicode_DATA(unicode);
4164 kind = PyUnicode_KIND(unicode);
4165 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166}
4167
4168int
4169PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4170{
4171 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004172 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004173 return -1;
4174 }
Victor Stinner488fa492011-12-12 00:01:39 +01004175 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004176 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004177 PyErr_SetString(PyExc_IndexError, "string index out of range");
4178 return -1;
4179 }
Victor Stinner488fa492011-12-12 00:01:39 +01004180 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004181 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004182 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4183 PyErr_SetString(PyExc_ValueError, "character out of range");
4184 return -1;
4185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004186 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4187 index, ch);
4188 return 0;
4189}
4190
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4196
Victor Stinner554f3f02010-06-16 23:33:54 +00004197/* create or adjust a UnicodeDecodeError */
4198static void
4199make_decode_exception(PyObject **exceptionObject,
4200 const char *encoding,
4201 const char *input, Py_ssize_t length,
4202 Py_ssize_t startpos, Py_ssize_t endpos,
4203 const char *reason)
4204{
4205 if (*exceptionObject == NULL) {
4206 *exceptionObject = PyUnicodeDecodeError_Create(
4207 encoding, input, length, startpos, endpos, reason);
4208 }
4209 else {
4210 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4211 goto onError;
4212 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4213 goto onError;
4214 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4215 goto onError;
4216 }
4217 return;
4218
4219onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004220 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004221}
4222
Steve Dowercc16be82016-09-08 10:35:16 -07004223#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004224static int
4225widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4226{
4227 if (newsize > *size) {
4228 wchar_t *newbuf = *buf;
4229 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4230 PyErr_NoMemory();
4231 return -1;
4232 }
4233 *buf = newbuf;
4234 }
4235 *size = newsize;
4236 return 0;
4237}
4238
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239/* error handling callback helper:
4240 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004241 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242 and adjust various state variables.
4243 return 0 on success, -1 on error
4244*/
4245
Alexander Belopolsky40018472011-02-26 01:02:56 +00004246static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004247unicode_decode_call_errorhandler_wchar(
4248 const char *errors, PyObject **errorHandler,
4249 const char *encoding, const char *reason,
4250 const char **input, const char **inend, Py_ssize_t *startinpos,
4251 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004252 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004254 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004255
4256 PyObject *restuple = NULL;
4257 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004258 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004259 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004260 Py_ssize_t requiredsize;
4261 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004262 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004263 wchar_t *repwstr;
4264 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004265
4266 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004267 *errorHandler = PyCodec_LookupError(errors);
4268 if (*errorHandler == NULL)
4269 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270 }
4271
Victor Stinner554f3f02010-06-16 23:33:54 +00004272 make_decode_exception(exceptionObject,
4273 encoding,
4274 *input, *inend - *input,
4275 *startinpos, *endinpos,
4276 reason);
4277 if (*exceptionObject == NULL)
4278 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279
Petr Viktorinffd97532020-02-11 17:46:57 +01004280 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004284 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004285 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004287 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004289
4290 /* Copy back the bytes variables, which might have been modified by the
4291 callback */
4292 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4293 if (!inputobj)
4294 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004295 *input = PyBytes_AS_STRING(inputobj);
4296 insize = PyBytes_GET_SIZE(inputobj);
4297 *inend = *input + insize;
4298 /* we can DECREF safely, as the exception has another reference,
4299 so the object won't go away. */
4300 Py_DECREF(inputobj);
4301
4302 if (newpos<0)
4303 newpos = insize+newpos;
4304 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004305 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004306 goto onError;
4307 }
4308
4309 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4310 if (repwstr == NULL)
4311 goto onError;
4312 /* need more space? (at least enough for what we
4313 have+the replacement+the rest of the string (starting
4314 at the new input position), so we won't have to check space
4315 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004316 requiredsize = *outpos;
4317 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4318 goto overflow;
4319 requiredsize += repwlen;
4320 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4321 goto overflow;
4322 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004323 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004324 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004325 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004326 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004327 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004329 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004331 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004332 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004333 *endinpos = newpos;
4334 *inptr = *input + newpos;
4335
4336 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004337 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004338 return 0;
4339
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004340 overflow:
4341 PyErr_SetString(PyExc_OverflowError,
4342 "decoded result is too long for a Python string");
4343
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 onError:
4345 Py_XDECREF(restuple);
4346 return -1;
4347}
Steve Dowercc16be82016-09-08 10:35:16 -07004348#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004349
4350static int
4351unicode_decode_call_errorhandler_writer(
4352 const char *errors, PyObject **errorHandler,
4353 const char *encoding, const char *reason,
4354 const char **input, const char **inend, Py_ssize_t *startinpos,
4355 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4356 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4357{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004358 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004359
4360 PyObject *restuple = NULL;
4361 PyObject *repunicode = NULL;
4362 Py_ssize_t insize;
4363 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004364 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004365 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004366 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004367 int need_to_grow = 0;
4368 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004369
4370 if (*errorHandler == NULL) {
4371 *errorHandler = PyCodec_LookupError(errors);
4372 if (*errorHandler == NULL)
4373 goto onError;
4374 }
4375
4376 make_decode_exception(exceptionObject,
4377 encoding,
4378 *input, *inend - *input,
4379 *startinpos, *endinpos,
4380 reason);
4381 if (*exceptionObject == NULL)
4382 goto onError;
4383
Petr Viktorinffd97532020-02-11 17:46:57 +01004384 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004385 if (restuple == NULL)
4386 goto onError;
4387 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004388 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004389 goto onError;
4390 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004391 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004392 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004393
4394 /* Copy back the bytes variables, which might have been modified by the
4395 callback */
4396 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4397 if (!inputobj)
4398 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004399 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004400 *input = PyBytes_AS_STRING(inputobj);
4401 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004402 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004403 /* we can DECREF safely, as the exception has another reference,
4404 so the object won't go away. */
4405 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004406
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004409 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004410 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004411 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004412 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413
Victor Stinner170ca6f2013-04-18 00:25:28 +02004414 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004415 if (replen > 1) {
4416 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004417 need_to_grow = 1;
4418 }
4419 new_inptr = *input + newpos;
4420 if (*inend - new_inptr > remain) {
4421 /* We don't know the decoding algorithm here so we make the worst
4422 assumption that one byte decodes to one unicode character.
4423 If unfortunately one byte could decode to more unicode characters,
4424 the decoder may write out-of-bound then. Is it possible for the
4425 algorithms using this function? */
4426 writer->min_length += *inend - new_inptr - remain;
4427 need_to_grow = 1;
4428 }
4429 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004430 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004431 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004432 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4433 goto onError;
4434 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004435 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004436 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004437
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004439 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004440
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004442 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004443 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004447 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448}
4449
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
 * Note: IS_BASE64 and FROM_BASE64 evaluate their argument more than once,
 * so the argument must not have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times and must not have side effects.  The
 * directO and directWS arguments are parenthesized so that compound
 * expressions (e.g. "a || b") are treated as a single flag. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530
Alexander Belopolsky40018472011-02-26 01:02:56 +00004531PyObject *
4532PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004533 Py_ssize_t size,
4534 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004536 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4537}
4538
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539/* The decoder. The only state we preserve is our read position,
4540 * i.e. how many characters we have consumed. So if we end in the
4541 * middle of a shift sequence we have to back off the read position
4542 * and the output to the beginning of the sequence, otherwise we lose
4543 * all the shift state (seen bits, number of bits seen, high
4544 * surrogate). */
4545
Alexander Belopolsky40018472011-02-26 01:02:56 +00004546PyObject *
4547PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004548 Py_ssize_t size,
4549 const char *errors,
4550 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004551{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004553 Py_ssize_t startinpos;
4554 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004556 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004557 const char *errmsg = "";
4558 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004559 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560 unsigned int base64bits = 0;
4561 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004562 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 PyObject *errorHandler = NULL;
4564 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004565
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004566 if (size == 0) {
4567 if (consumed)
4568 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004569 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004570 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004572 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004573 _PyUnicodeWriter_Init(&writer);
4574 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004575
4576 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004577 e = s + size;
4578
4579 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004580 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004582 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004583
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 if (inShift) { /* in a base-64 section */
4585 if (IS_BASE64(ch)) { /* consume a base-64 character */
4586 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4587 base64bits += 6;
4588 s++;
4589 if (base64bits >= 16) {
4590 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004591 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592 base64bits -= 16;
4593 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004594 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004595 if (surrogate) {
4596 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004597 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4598 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004599 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004600 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004601 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004602 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 }
4604 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004605 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004606 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 }
4609 }
Victor Stinner551ac952011-11-29 22:58:13 +01004610 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 /* first surrogate */
4612 surrogate = outCh;
4613 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004615 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004616 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004617 }
4618 }
4619 }
4620 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004621 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 if (base64bits > 0) { /* left-over bits */
4623 if (base64bits >= 6) {
4624 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004625 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004626 errmsg = "partial character in shift sequence";
4627 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004628 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004629 else {
4630 /* Some bits remain; they should be zero */
4631 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004632 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 errmsg = "non-zero padding bits in shift sequence";
4634 goto utf7Error;
4635 }
4636 }
4637 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004638 if (surrogate && DECODE_DIRECT(ch)) {
4639 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4640 goto onError;
4641 }
4642 surrogate = 0;
4643 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004644 /* '-' is absorbed; other terminating
4645 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004646 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648 }
4649 }
4650 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004652 s++; /* consume '+' */
4653 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004654 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004655 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004656 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004658 else if (s < e && !IS_BASE64(*s)) {
4659 s++;
4660 errmsg = "ill-formed sequence";
4661 goto utf7Error;
4662 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004664 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004665 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004666 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004668 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004669 }
4670 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004671 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004672 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004673 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004674 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004675 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004676 else {
4677 startinpos = s-starts;
4678 s++;
4679 errmsg = "unexpected special character";
4680 goto utf7Error;
4681 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004682 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004683utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004685 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 errors, &errorHandler,
4687 "utf7", errmsg,
4688 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004689 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004691 }
4692
Antoine Pitrou244651a2009-05-04 18:56:13 +00004693 /* end of string */
4694
4695 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4696 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004697 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004698 if (surrogate ||
4699 (base64bits >= 6) ||
4700 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004701 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004702 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004703 errors, &errorHandler,
4704 "utf7", "unterminated shift sequence",
4705 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004706 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004707 goto onError;
4708 if (s < e)
4709 goto restart;
4710 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004712
4713 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004714 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004715 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004716 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004717 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004718 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004719 writer.kind, writer.data, shiftOutStart);
4720 Py_XDECREF(errorHandler);
4721 Py_XDECREF(exc);
4722 _PyUnicodeWriter_Dealloc(&writer);
4723 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004724 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004725 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 }
4727 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004728 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004729 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004730 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004731
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 Py_XDECREF(errorHandler);
4733 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004734 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735
Benjamin Peterson29060642009-01-31 22:14:21 +00004736 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737 Py_XDECREF(errorHandler);
4738 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004739 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004740 return NULL;
4741}
4742
4743
Alexander Belopolsky40018472011-02-26 01:02:56 +00004744PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004745_PyUnicode_EncodeUTF7(PyObject *str,
4746 int base64SetO,
4747 int base64WhiteSpace,
4748 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004749{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004750 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004751 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004752 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004753 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004754 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004755 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004756 unsigned int base64bits = 0;
4757 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004758 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004759 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004760
Benjamin Petersonbac79492012-01-14 13:34:47 -05004761 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004762 return NULL;
4763 kind = PyUnicode_KIND(str);
4764 data = PyUnicode_DATA(str);
4765 len = PyUnicode_GET_LENGTH(str);
4766
4767 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004768 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004769
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004770 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004771 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004772 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004773 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004774 if (v == NULL)
4775 return NULL;
4776
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004777 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004778 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004779 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004780
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 if (inShift) {
4782 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4783 /* shifting out */
4784 if (base64bits) { /* output remaining bits */
4785 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4786 base64buffer = 0;
4787 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004788 }
4789 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004790 /* Characters not in the BASE64 set implicitly unshift the sequence
4791 so no '-' is required, except if the character is itself a '-' */
4792 if (IS_BASE64(ch) || ch == '-') {
4793 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004794 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004795 *out++ = (char) ch;
4796 }
4797 else {
4798 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004799 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004800 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 else { /* not in a shift sequence */
4802 if (ch == '+') {
4803 *out++ = '+';
4804 *out++ = '-';
4805 }
4806 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4807 *out++ = (char) ch;
4808 }
4809 else {
4810 *out++ = '+';
4811 inShift = 1;
4812 goto encode_char;
4813 }
4814 }
4815 continue;
4816encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004817 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004818 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004819
Antoine Pitrou244651a2009-05-04 18:56:13 +00004820 /* code first surrogate */
4821 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004822 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004823 while (base64bits >= 6) {
4824 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4825 base64bits -= 6;
4826 }
4827 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004828 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004829 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004830 base64bits += 16;
4831 base64buffer = (base64buffer << 16) | ch;
4832 while (base64bits >= 6) {
4833 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4834 base64bits -= 6;
4835 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004836 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004837 if (base64bits)
4838 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4839 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004840 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004841 if (_PyBytes_Resize(&v, out - start) < 0)
4842 return NULL;
4843 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004844}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004845PyObject *
4846PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4847 Py_ssize_t size,
4848 int base64SetO,
4849 int base64WhiteSpace,
4850 const char *errors)
4851{
4852 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004853 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004854 if (tmp == NULL)
4855 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004856 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004857 base64WhiteSpace, errors);
4858 Py_DECREF(tmp);
4859 return result;
4860}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004861
Antoine Pitrou244651a2009-05-04 18:56:13 +00004862#undef IS_BASE64
4863#undef FROM_BASE64
4864#undef TO_BASE64
4865#undef DECODE_DIRECT
4866#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004867
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868/* --- UTF-8 Codec -------------------------------------------------------- */
4869
Alexander Belopolsky40018472011-02-26 01:02:56 +00004870PyObject *
4871PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004872 Py_ssize_t size,
4873 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874{
Walter Dörwald69652032004-09-07 20:24:22 +00004875 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4876}
4877
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004878#include "stringlib/asciilib.h"
4879#include "stringlib/codecs.h"
4880#include "stringlib/undef.h"
4881
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004882#include "stringlib/ucs1lib.h"
4883#include "stringlib/codecs.h"
4884#include "stringlib/undef.h"
4885
4886#include "stringlib/ucs2lib.h"
4887#include "stringlib/codecs.h"
4888#include "stringlib/undef.h"
4889
4890#include "stringlib/ucs4lib.h"
4891#include "stringlib/codecs.h"
4892#include "stringlib/undef.h"
4893
Antoine Pitrouab868312009-01-10 15:40:25 +00004894/* Mask to quickly check whether a C 'long' contains a
4895 non-ASCII, UTF8-encoded char. */
4896#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004897# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004898#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004899# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004900#else
4901# error C 'long' size should be either 4 or 8!
4902#endif
4903
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004904static Py_ssize_t
4905ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004906{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004907 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004908 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004909
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004910 /*
4911 * Issue #17237: m68k is a bit different from most architectures in
4912 * that objects do not use "natural alignment" - for example, int and
4913 * long are only aligned at 2-byte boundaries. Therefore the assert()
4914 * won't work; also, tests have shown that skipping the "optimised
4915 * version" will even speed up m68k.
4916 */
4917#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004919 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4920 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 /* Fast path, see in STRINGLIB(utf8_decode) for
4922 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004923 /* Help allocation */
4924 const char *_p = p;
4925 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926 while (_p < aligned_end) {
4927 unsigned long value = *(const unsigned long *) _p;
4928 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004930 *((unsigned long *)q) = value;
4931 _p += SIZEOF_LONG;
4932 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004933 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004934 p = _p;
4935 while (p < end) {
4936 if ((unsigned char)*p & 0x80)
4937 break;
4938 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004942#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004943#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004944 while (p < end) {
4945 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4946 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004947 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004948 /* Help allocation */
4949 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004950 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004951 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004952 if (value & ASCII_CHAR_MASK)
4953 break;
4954 _p += SIZEOF_LONG;
4955 }
4956 p = _p;
4957 if (_p == end)
4958 break;
4959 }
4960 if ((unsigned char)*p & 0x80)
4961 break;
4962 ++p;
4963 }
4964 memcpy(dest, start, p - start);
4965 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966}
Antoine Pitrouab868312009-01-10 15:40:25 +00004967
Victor Stinner709d23d2019-05-02 14:56:30 -04004968static PyObject *
4969unicode_decode_utf8(const char *s, Py_ssize_t size,
4970 _Py_error_handler error_handler, const char *errors,
4971 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004972{
Victor Stinner785938e2011-12-11 20:09:03 +01004973 if (size == 0) {
4974 if (consumed)
4975 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004976 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004977 }
4978
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004979 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4980 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004981 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 *consumed = 1;
4983 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004984 }
4985
Inada Naoki770847a2019-06-24 12:30:24 +09004986 const char *starts = s;
4987 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004988
Inada Naoki770847a2019-06-24 12:30:24 +09004989 // fast path: try ASCII string.
4990 PyObject *u = PyUnicode_New(size, 127);
4991 if (u == NULL) {
4992 return NULL;
4993 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004994 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09004995 if (s == end) {
4996 return u;
4997 }
4998
4999 // Use _PyUnicodeWriter after fast path is failed.
5000 _PyUnicodeWriter writer;
5001 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5002 writer.pos = s - starts;
5003
5004 Py_ssize_t startinpos, endinpos;
5005 const char *errmsg = "";
5006 PyObject *error_handler_obj = NULL;
5007 PyObject *exc = NULL;
5008
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 while (s < end) {
5010 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005011 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005012
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005013 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005014 if (PyUnicode_IS_ASCII(writer.buffer))
5015 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005016 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005017 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005018 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005019 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005020 } else {
5021 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005022 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005023 }
5024
5025 switch (ch) {
5026 case 0:
5027 if (s == end || consumed)
5028 goto End;
5029 errmsg = "unexpected end of data";
5030 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005031 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005032 break;
5033 case 1:
5034 errmsg = "invalid start byte";
5035 startinpos = s - starts;
5036 endinpos = startinpos + 1;
5037 break;
5038 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005039 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5040 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5041 {
5042 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005043 goto End;
5044 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005045 /* fall through */
5046 case 3:
5047 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005048 errmsg = "invalid continuation byte";
5049 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005050 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005051 break;
5052 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005053 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005054 goto onError;
5055 continue;
5056 }
5057
Victor Stinner1d65d912015-10-05 13:43:50 +02005058 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005059 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005060
5061 switch (error_handler) {
5062 case _Py_ERROR_IGNORE:
5063 s += (endinpos - startinpos);
5064 break;
5065
5066 case _Py_ERROR_REPLACE:
5067 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5068 goto onError;
5069 s += (endinpos - startinpos);
5070 break;
5071
5072 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005073 {
5074 Py_ssize_t i;
5075
Victor Stinner1d65d912015-10-05 13:43:50 +02005076 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5077 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005078 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005079 ch = (Py_UCS4)(unsigned char)(starts[i]);
5080 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5081 ch + 0xdc00);
5082 writer.pos++;
5083 }
5084 s += (endinpos - startinpos);
5085 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005086 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005087
5088 default:
5089 if (unicode_decode_call_errorhandler_writer(
5090 errors, &error_handler_obj,
5091 "utf-8", errmsg,
5092 &starts, &end, &startinpos, &endinpos, &exc, &s,
5093 &writer))
5094 goto onError;
5095 }
Victor Stinner785938e2011-12-11 20:09:03 +01005096 }
5097
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005098End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005099 if (consumed)
5100 *consumed = s - starts;
5101
Victor Stinner1d65d912015-10-05 13:43:50 +02005102 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005103 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005104 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005105
5106onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005107 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005108 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005109 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005110 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005111}
5112
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005113
Victor Stinner709d23d2019-05-02 14:56:30 -04005114PyObject *
5115PyUnicode_DecodeUTF8Stateful(const char *s,
5116 Py_ssize_t size,
5117 const char *errors,
5118 Py_ssize_t *consumed)
5119{
5120 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5121}
5122
5123
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005124/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5125 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005126
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005127 On success, write a pointer to a newly allocated wide character string into
5128 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5129 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005130
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005131 On memory allocation failure, return -1.
5132
5133 On decoding error (if surrogateescape is zero), return -2. If wlen is
5134 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5135 is not NULL, write the decoding error message into *reason. */
5136int
5137_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005138 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005139{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005140 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005141 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005142 wchar_t *unicode;
5143 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005144
Victor Stinner3d4226a2018-08-29 22:21:32 +02005145 int surrogateescape = 0;
5146 int surrogatepass = 0;
5147 switch (errors)
5148 {
5149 case _Py_ERROR_STRICT:
5150 break;
5151 case _Py_ERROR_SURROGATEESCAPE:
5152 surrogateescape = 1;
5153 break;
5154 case _Py_ERROR_SURROGATEPASS:
5155 surrogatepass = 1;
5156 break;
5157 default:
5158 return -3;
5159 }
5160
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005161 /* Note: size will always be longer than the resulting Unicode
5162 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005163 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005164 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005165 }
5166
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005167 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005168 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005169 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005170 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005171
5172 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005173 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005174 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005175 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005176 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005177#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005178 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005179#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005180 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005181#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005182 if (ch > 0xFF) {
5183#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005184 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005185#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005186 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005187 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005188 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5189 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5190#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005191 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005192 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005193 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005194 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005195 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005196
5197 if (surrogateescape) {
5198 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5199 }
5200 else {
5201 /* Is it a valid three-byte code? */
5202 if (surrogatepass
5203 && (e - s) >= 3
5204 && (s[0] & 0xf0) == 0xe0
5205 && (s[1] & 0xc0) == 0x80
5206 && (s[2] & 0xc0) == 0x80)
5207 {
5208 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5209 s += 3;
5210 unicode[outpos++] = ch;
5211 }
5212 else {
5213 PyMem_RawFree(unicode );
5214 if (reason != NULL) {
5215 switch (ch) {
5216 case 0:
5217 *reason = "unexpected end of data";
5218 break;
5219 case 1:
5220 *reason = "invalid start byte";
5221 break;
5222 /* 2, 3, 4 */
5223 default:
5224 *reason = "invalid continuation byte";
5225 break;
5226 }
5227 }
5228 if (wlen != NULL) {
5229 *wlen = s - orig_s;
5230 }
5231 return -2;
5232 }
5233 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005234 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005235 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005236 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005237 if (wlen) {
5238 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005239 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005240 *wstr = unicode;
5241 return 0;
5242}
5243
Victor Stinner5f9cf232019-03-19 01:46:25 +01005244
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005245wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005246_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5247 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005248{
5249 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005250 int res = _Py_DecodeUTF8Ex(arg, arglen,
5251 &wstr, wlen,
5252 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005253 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005254 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5255 assert(res != -3);
5256 if (wlen) {
5257 *wlen = (size_t)res;
5258 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005259 return NULL;
5260 }
5261 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005262}
5263
Antoine Pitrouab868312009-01-10 15:40:25 +00005264
Victor Stinnere47e6982017-12-21 15:45:16 +01005265/* UTF-8 encoder using the surrogateescape error handler .
5266
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005267 On success, return 0 and write the newly allocated character string (use
5268 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005269
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005270 On encoding failure, return -2 and write the position of the invalid
5271 surrogate character into *error_pos (if error_pos is set) and the decoding
5272 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005273
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005274 On memory allocation failure, return -1. */
5275int
5276_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005277 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005278{
5279 const Py_ssize_t max_char_size = 4;
5280 Py_ssize_t len = wcslen(text);
5281
5282 assert(len >= 0);
5283
Victor Stinner3d4226a2018-08-29 22:21:32 +02005284 int surrogateescape = 0;
5285 int surrogatepass = 0;
5286 switch (errors)
5287 {
5288 case _Py_ERROR_STRICT:
5289 break;
5290 case _Py_ERROR_SURROGATEESCAPE:
5291 surrogateescape = 1;
5292 break;
5293 case _Py_ERROR_SURROGATEPASS:
5294 surrogatepass = 1;
5295 break;
5296 default:
5297 return -3;
5298 }
5299
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005300 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5301 return -1;
5302 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005303 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005304 if (raw_malloc) {
5305 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005306 }
5307 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005308 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005309 }
5310 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005311 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005312 }
5313
5314 char *p = bytes;
5315 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005316 for (i = 0; i < len; ) {
5317 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005318 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005319 i++;
5320#if Py_UNICODE_SIZE == 2
5321 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5322 && i < len
5323 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5324 {
5325 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5326 i++;
5327 }
5328#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005329
5330 if (ch < 0x80) {
5331 /* Encode ASCII */
5332 *p++ = (char) ch;
5333
5334 }
5335 else if (ch < 0x0800) {
5336 /* Encode Latin-1 */
5337 *p++ = (char)(0xc0 | (ch >> 6));
5338 *p++ = (char)(0x80 | (ch & 0x3f));
5339 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005340 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005341 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005342 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005343 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005344 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005345 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005346 if (reason != NULL) {
5347 *reason = "encoding error";
5348 }
5349 if (raw_malloc) {
5350 PyMem_RawFree(bytes);
5351 }
5352 else {
5353 PyMem_Free(bytes);
5354 }
5355 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005356 }
5357 *p++ = (char)(ch & 0xff);
5358 }
5359 else if (ch < 0x10000) {
5360 *p++ = (char)(0xe0 | (ch >> 12));
5361 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5362 *p++ = (char)(0x80 | (ch & 0x3f));
5363 }
5364 else { /* ch >= 0x10000 */
5365 assert(ch <= MAX_UNICODE);
5366 /* Encode UCS4 Unicode ordinals */
5367 *p++ = (char)(0xf0 | (ch >> 18));
5368 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5369 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5370 *p++ = (char)(0x80 | (ch & 0x3f));
5371 }
5372 }
5373 *p++ = '\0';
5374
5375 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005376 char *bytes2;
5377 if (raw_malloc) {
5378 bytes2 = PyMem_RawRealloc(bytes, final_size);
5379 }
5380 else {
5381 bytes2 = PyMem_Realloc(bytes, final_size);
5382 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005383 if (bytes2 == NULL) {
5384 if (error_pos != NULL) {
5385 *error_pos = (size_t)-1;
5386 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005387 if (raw_malloc) {
5388 PyMem_RawFree(bytes);
5389 }
5390 else {
5391 PyMem_Free(bytes);
5392 }
5393 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005394 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005395 *str = bytes2;
5396 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005397}
5398
5399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005400/* Primary internal function which creates utf8 encoded bytes objects.
5401
5402 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005403 and allocate exactly as much space needed at the end. Else allocate the
5404 maximum possible needed (4 result bytes per Unicode character), and return
5405 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005406*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005407static PyObject *
5408unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005411 if (!PyUnicode_Check(unicode)) {
5412 PyErr_BadArgument();
5413 return NULL;
5414 }
5415
5416 if (PyUnicode_READY(unicode) == -1)
5417 return NULL;
5418
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005419 if (PyUnicode_UTF8(unicode))
5420 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5421 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005422
Inada Naoki02a4d572020-02-27 13:48:59 +09005423 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005424 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005425 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5426
5427 _PyBytesWriter writer;
5428 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005429
Benjamin Petersonead6b532011-12-20 17:23:42 -06005430 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005431 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005432 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005433 case PyUnicode_1BYTE_KIND:
5434 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5435 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005436 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5437 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005438 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005439 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5440 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005441 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005442 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5443 break;
Tim Peters602f7402002-04-27 18:03:26 +00005444 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005445
5446 if (end == NULL) {
5447 _PyBytesWriter_Dealloc(&writer);
5448 return NULL;
5449 }
5450 return _PyBytesWriter_Finish(&writer, end);
5451}
5452
5453static int
5454unicode_fill_utf8(PyObject *unicode)
5455{
5456 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5457 assert(!PyUnicode_IS_ASCII(unicode));
5458
5459 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005460 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005461 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5462
5463 _PyBytesWriter writer;
5464 char *end;
5465
5466 switch (kind) {
5467 default:
5468 Py_UNREACHABLE();
5469 case PyUnicode_1BYTE_KIND:
5470 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5471 _Py_ERROR_STRICT, NULL);
5472 break;
5473 case PyUnicode_2BYTE_KIND:
5474 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5475 _Py_ERROR_STRICT, NULL);
5476 break;
5477 case PyUnicode_4BYTE_KIND:
5478 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5479 _Py_ERROR_STRICT, NULL);
5480 break;
5481 }
5482 if (end == NULL) {
5483 _PyBytesWriter_Dealloc(&writer);
5484 return -1;
5485 }
5486
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005487 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005488 PyBytes_AS_STRING(writer.buffer);
5489 Py_ssize_t len = end - start;
5490
5491 char *cache = PyObject_MALLOC(len + 1);
5492 if (cache == NULL) {
5493 _PyBytesWriter_Dealloc(&writer);
5494 PyErr_NoMemory();
5495 return -1;
5496 }
5497 _PyUnicode_UTF8(unicode) = cache;
5498 _PyUnicode_UTF8_LENGTH(unicode) = len;
5499 memcpy(cache, start, len);
5500 cache[len] = '\0';
5501 _PyBytesWriter_Dealloc(&writer);
5502 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503}
5504
Alexander Belopolsky40018472011-02-26 01:02:56 +00005505PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005506_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5507{
5508 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5509}
5510
5511
5512PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005513PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5514 Py_ssize_t size,
5515 const char *errors)
5516{
5517 PyObject *v, *unicode;
5518
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005519 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005520 if (unicode == NULL)
5521 return NULL;
5522 v = _PyUnicode_AsUTF8String(unicode, errors);
5523 Py_DECREF(unicode);
5524 return v;
5525}
5526
5527PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005528PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005530 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531}
5532
Walter Dörwald41980ca2007-08-16 21:55:45 +00005533/* --- UTF-32 Codec ------------------------------------------------------- */
5534
5535PyObject *
5536PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 Py_ssize_t size,
5538 const char *errors,
5539 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005540{
5541 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5542}
5543
5544PyObject *
5545PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 Py_ssize_t size,
5547 const char *errors,
5548 int *byteorder,
5549 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005550{
5551 const char *starts = s;
5552 Py_ssize_t startinpos;
5553 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005554 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005555 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005556 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005557 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005558 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005559 PyObject *errorHandler = NULL;
5560 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005561
Andy Lestere6be9b52020-02-11 20:28:35 -06005562 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005563 e = q + size;
5564
5565 if (byteorder)
5566 bo = *byteorder;
5567
5568 /* Check for BOM marks (U+FEFF) in the input and adjust current
5569 byte order setting accordingly. In native mode, the leading BOM
5570 mark is skipped, in all other modes, it is copied to the output
5571 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005572 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005573 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005574 if (bom == 0x0000FEFF) {
5575 bo = -1;
5576 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005578 else if (bom == 0xFFFE0000) {
5579 bo = 1;
5580 q += 4;
5581 }
5582 if (byteorder)
5583 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005584 }
5585
Victor Stinnere64322e2012-10-30 23:12:47 +01005586 if (q == e) {
5587 if (consumed)
5588 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005589 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005590 }
5591
Victor Stinnere64322e2012-10-30 23:12:47 +01005592#ifdef WORDS_BIGENDIAN
5593 le = bo < 0;
5594#else
5595 le = bo <= 0;
5596#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005597 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005598
Victor Stinner8f674cc2013-04-17 23:02:17 +02005599 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005600 writer.min_length = (e - q + 3) / 4;
5601 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005602 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005603
Victor Stinnere64322e2012-10-30 23:12:47 +01005604 while (1) {
5605 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005606 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005607
Victor Stinnere64322e2012-10-30 23:12:47 +01005608 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005609 enum PyUnicode_Kind kind = writer.kind;
5610 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005611 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005612 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005613 if (le) {
5614 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005615 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005616 if (ch > maxch)
5617 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005618 if (kind != PyUnicode_1BYTE_KIND &&
5619 Py_UNICODE_IS_SURROGATE(ch))
5620 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005621 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005622 q += 4;
5623 } while (q <= last);
5624 }
5625 else {
5626 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005627 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005628 if (ch > maxch)
5629 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005630 if (kind != PyUnicode_1BYTE_KIND &&
5631 Py_UNICODE_IS_SURROGATE(ch))
5632 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005634 q += 4;
5635 } while (q <= last);
5636 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005637 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005638 }
5639
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005640 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005641 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005642 startinpos = ((const char *)q) - starts;
5643 endinpos = startinpos + 4;
5644 }
5645 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005646 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005648 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005650 startinpos = ((const char *)q) - starts;
5651 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005653 else {
5654 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005655 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005656 goto onError;
5657 q += 4;
5658 continue;
5659 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005660 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005661 startinpos = ((const char *)q) - starts;
5662 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005664
5665 /* The remaining input chars are ignored if the callback
5666 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005667 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005669 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005671 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005673 }
5674
Walter Dörwald41980ca2007-08-16 21:55:45 +00005675 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005677
Walter Dörwald41980ca2007-08-16 21:55:45 +00005678 Py_XDECREF(errorHandler);
5679 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005680 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005681
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005683 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005684 Py_XDECREF(errorHandler);
5685 Py_XDECREF(exc);
5686 return NULL;
5687}
5688
5689PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005690_PyUnicode_EncodeUTF32(PyObject *str,
5691 const char *errors,
5692 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005693{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005694 enum PyUnicode_Kind kind;
5695 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005696 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005697 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005698 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005699#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005700 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005701#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005702 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005703#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005704 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005705 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005706 PyObject *errorHandler = NULL;
5707 PyObject *exc = NULL;
5708 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005709
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005710 if (!PyUnicode_Check(str)) {
5711 PyErr_BadArgument();
5712 return NULL;
5713 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005714 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005715 return NULL;
5716 kind = PyUnicode_KIND(str);
5717 data = PyUnicode_DATA(str);
5718 len = PyUnicode_GET_LENGTH(str);
5719
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005720 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005721 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005722 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005723 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005724 if (v == NULL)
5725 return NULL;
5726
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005727 /* output buffer is 4-bytes aligned */
5728 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005729 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005730 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005731 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005732 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005733 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005734
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005735 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005736 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005737 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005738 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005739 else
5740 encoding = "utf-32";
5741
5742 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005743 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5744 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005745 }
5746
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005747 pos = 0;
5748 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005749 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005750
5751 if (kind == PyUnicode_2BYTE_KIND) {
5752 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5753 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005754 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005755 else {
5756 assert(kind == PyUnicode_4BYTE_KIND);
5757 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5758 &out, native_ordering);
5759 }
5760 if (pos == len)
5761 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005762
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005763 rep = unicode_encode_call_errorhandler(
5764 errors, &errorHandler,
5765 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005766 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 if (!rep)
5768 goto error;
5769
5770 if (PyBytes_Check(rep)) {
5771 repsize = PyBytes_GET_SIZE(rep);
5772 if (repsize & 3) {
5773 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005774 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005775 "surrogates not allowed");
5776 goto error;
5777 }
5778 moreunits = repsize / 4;
5779 }
5780 else {
5781 assert(PyUnicode_Check(rep));
5782 if (PyUnicode_READY(rep) < 0)
5783 goto error;
5784 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5785 if (!PyUnicode_IS_ASCII(rep)) {
5786 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005787 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005788 "surrogates not allowed");
5789 goto error;
5790 }
5791 }
5792
5793 /* four bytes are reserved for each surrogate */
5794 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005795 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005796 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005797 /* integer overflow */
5798 PyErr_NoMemory();
5799 goto error;
5800 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005801 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005802 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005803 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005804 }
5805
5806 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005807 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005808 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005809 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005810 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005811 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5812 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005813 }
5814
5815 Py_CLEAR(rep);
5816 }
5817
5818 /* Cut back to size actually needed. This is necessary for, for example,
5819 encoding of a string containing isolated surrogates and the 'ignore'
5820 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005821 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005822 if (nsize != PyBytes_GET_SIZE(v))
5823 _PyBytes_Resize(&v, nsize);
5824 Py_XDECREF(errorHandler);
5825 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005826 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005827 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005828 error:
5829 Py_XDECREF(rep);
5830 Py_XDECREF(errorHandler);
5831 Py_XDECREF(exc);
5832 Py_XDECREF(v);
5833 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005834}
5835
Alexander Belopolsky40018472011-02-26 01:02:56 +00005836PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005837PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5838 Py_ssize_t size,
5839 const char *errors,
5840 int byteorder)
5841{
5842 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005843 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005844 if (tmp == NULL)
5845 return NULL;
5846 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5847 Py_DECREF(tmp);
5848 return result;
5849}
5850
5851PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005852PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005853{
Victor Stinnerb960b342011-11-20 19:12:52 +01005854 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005855}
5856
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857/* --- UTF-16 Codec ------------------------------------------------------- */
5858
Tim Peters772747b2001-08-09 22:21:55 +00005859PyObject *
5860PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 Py_ssize_t size,
5862 const char *errors,
5863 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864{
Walter Dörwald69652032004-09-07 20:24:22 +00005865 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5866}
5867
5868PyObject *
5869PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 Py_ssize_t size,
5871 const char *errors,
5872 int *byteorder,
5873 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005874{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005875 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005876 Py_ssize_t startinpos;
5877 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005878 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005879 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005880 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005881 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005882 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 PyObject *errorHandler = NULL;
5884 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005885 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886
Andy Lestere6be9b52020-02-11 20:28:35 -06005887 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005888 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889
5890 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005891 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005893 /* Check for BOM marks (U+FEFF) in the input and adjust current
5894 byte order setting accordingly. In native mode, the leading BOM
5895 mark is skipped, in all other modes, it is copied to the output
5896 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005897 if (bo == 0 && size >= 2) {
5898 const Py_UCS4 bom = (q[1] << 8) | q[0];
5899 if (bom == 0xFEFF) {
5900 q += 2;
5901 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005902 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005903 else if (bom == 0xFFFE) {
5904 q += 2;
5905 bo = 1;
5906 }
5907 if (byteorder)
5908 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
Antoine Pitrou63065d72012-05-15 23:48:04 +02005911 if (q == e) {
5912 if (consumed)
5913 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005914 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005915 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005916
Christian Heimes743e0cd2012-10-17 23:52:17 +02005917#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005918 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005919 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005920#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005921 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005922 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005923#endif
Tim Peters772747b2001-08-09 22:21:55 +00005924
Antoine Pitrou63065d72012-05-15 23:48:04 +02005925 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005926 character count normally. Error handler will take care of
5927 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005928 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005929 writer.min_length = (e - q + 1) / 2;
5930 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005931 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005932
Antoine Pitrou63065d72012-05-15 23:48:04 +02005933 while (1) {
5934 Py_UCS4 ch = 0;
5935 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005936 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005937 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005938 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005939 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005940 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005941 native_ordering);
5942 else
5943 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005944 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005945 native_ordering);
5946 } else if (kind == PyUnicode_2BYTE_KIND) {
5947 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005948 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005949 native_ordering);
5950 } else {
5951 assert(kind == PyUnicode_4BYTE_KIND);
5952 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005953 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005954 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005955 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005956 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005957
Antoine Pitrou63065d72012-05-15 23:48:04 +02005958 switch (ch)
5959 {
5960 case 0:
5961 /* remaining byte at the end? (size should be even) */
5962 if (q == e || consumed)
5963 goto End;
5964 errmsg = "truncated data";
5965 startinpos = ((const char *)q) - starts;
5966 endinpos = ((const char *)e) - starts;
5967 break;
5968 /* The remaining input chars are ignored if the callback
5969 chooses to skip the input */
5970 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005971 q -= 2;
5972 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005973 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005974 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005975 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005976 endinpos = ((const char *)e) - starts;
5977 break;
5978 case 2:
5979 errmsg = "illegal encoding";
5980 startinpos = ((const char *)q) - 2 - starts;
5981 endinpos = startinpos + 2;
5982 break;
5983 case 3:
5984 errmsg = "illegal UTF-16 surrogate";
5985 startinpos = ((const char *)q) - 4 - starts;
5986 endinpos = startinpos + 2;
5987 break;
5988 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005989 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005990 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 continue;
5992 }
5993
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005994 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005995 errors,
5996 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005997 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005998 &starts,
5999 (const char **)&e,
6000 &startinpos,
6001 &endinpos,
6002 &exc,
6003 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006004 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 }
6007
Antoine Pitrou63065d72012-05-15 23:48:04 +02006008End:
Walter Dörwald69652032004-09-07 20:24:22 +00006009 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006011
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006012 Py_XDECREF(errorHandler);
6013 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006014 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006017 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018 Py_XDECREF(errorHandler);
6019 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 return NULL;
6021}
6022
Tim Peters772747b2001-08-09 22:21:55 +00006023PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006024_PyUnicode_EncodeUTF16(PyObject *str,
6025 const char *errors,
6026 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006028 enum PyUnicode_Kind kind;
6029 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006030 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006031 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006032 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006033 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006034#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006035 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006036#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006037 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006038#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006039 const char *encoding;
6040 Py_ssize_t nsize, pos;
6041 PyObject *errorHandler = NULL;
6042 PyObject *exc = NULL;
6043 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006044
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006045 if (!PyUnicode_Check(str)) {
6046 PyErr_BadArgument();
6047 return NULL;
6048 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006049 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006050 return NULL;
6051 kind = PyUnicode_KIND(str);
6052 data = PyUnicode_DATA(str);
6053 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006054
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006055 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006056 if (kind == PyUnicode_4BYTE_KIND) {
6057 const Py_UCS4 *in = (const Py_UCS4 *)data;
6058 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006059 while (in < end) {
6060 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006061 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006062 }
6063 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006064 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006065 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006067 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006068 nsize = len + pairs + (byteorder == 0);
6069 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006070 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006074 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006075 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006076 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006077 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006078 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006079 }
6080 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006081 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006082 }
Tim Peters772747b2001-08-09 22:21:55 +00006083
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006084 if (kind == PyUnicode_1BYTE_KIND) {
6085 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6086 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006087 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006088
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006089 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006090 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006091 }
6092 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006093 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006094 }
6095 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006096 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006097 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006098
6099 pos = 0;
6100 while (pos < len) {
6101 Py_ssize_t repsize, moreunits;
6102
6103 if (kind == PyUnicode_2BYTE_KIND) {
6104 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6105 &out, native_ordering);
6106 }
6107 else {
6108 assert(kind == PyUnicode_4BYTE_KIND);
6109 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6110 &out, native_ordering);
6111 }
6112 if (pos == len)
6113 break;
6114
6115 rep = unicode_encode_call_errorhandler(
6116 errors, &errorHandler,
6117 encoding, "surrogates not allowed",
6118 str, &exc, pos, pos + 1, &pos);
6119 if (!rep)
6120 goto error;
6121
6122 if (PyBytes_Check(rep)) {
6123 repsize = PyBytes_GET_SIZE(rep);
6124 if (repsize & 1) {
6125 raise_encode_exception(&exc, encoding,
6126 str, pos - 1, pos,
6127 "surrogates not allowed");
6128 goto error;
6129 }
6130 moreunits = repsize / 2;
6131 }
6132 else {
6133 assert(PyUnicode_Check(rep));
6134 if (PyUnicode_READY(rep) < 0)
6135 goto error;
6136 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6137 if (!PyUnicode_IS_ASCII(rep)) {
6138 raise_encode_exception(&exc, encoding,
6139 str, pos - 1, pos,
6140 "surrogates not allowed");
6141 goto error;
6142 }
6143 }
6144
6145 /* two bytes are reserved for each surrogate */
6146 if (moreunits > 1) {
6147 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006148 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006149 /* integer overflow */
6150 PyErr_NoMemory();
6151 goto error;
6152 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006153 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006154 goto error;
6155 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6156 }
6157
6158 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006159 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006160 out += moreunits;
6161 } else /* rep is unicode */ {
6162 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6163 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6164 &out, native_ordering);
6165 }
6166
6167 Py_CLEAR(rep);
6168 }
6169
6170 /* Cut back to size actually needed. This is necessary for, for example,
6171 encoding of a string containing isolated surrogates and the 'ignore' handler
6172 is used. */
6173 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6174 if (nsize != PyBytes_GET_SIZE(v))
6175 _PyBytes_Resize(&v, nsize);
6176 Py_XDECREF(errorHandler);
6177 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006178 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006179 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006180 error:
6181 Py_XDECREF(rep);
6182 Py_XDECREF(errorHandler);
6183 Py_XDECREF(exc);
6184 Py_XDECREF(v);
6185 return NULL;
6186#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187}
6188
Alexander Belopolsky40018472011-02-26 01:02:56 +00006189PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006190PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6191 Py_ssize_t size,
6192 const char *errors,
6193 int byteorder)
6194{
6195 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006196 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006197 if (tmp == NULL)
6198 return NULL;
6199 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6200 Py_DECREF(tmp);
6201 return result;
6202}
6203
6204PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006205PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006207 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208}
6209
6210/* --- Unicode Escape Codec ----------------------------------------------- */
6211
Fredrik Lundh06d12682001-01-24 07:59:11 +00006212static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006213
Alexander Belopolsky40018472011-02-26 01:02:56 +00006214PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006215_PyUnicode_DecodeUnicodeEscape(const char *s,
6216 Py_ssize_t size,
6217 const char *errors,
6218 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006220 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006221 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006223 PyObject *errorHandler = NULL;
6224 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006225
Eric V. Smith42454af2016-10-31 09:22:08 -04006226 // so we can remember if we've seen an invalid escape char or not
6227 *first_invalid_escape = NULL;
6228
Victor Stinner62ec3312016-09-06 17:04:34 -07006229 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006230 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006231 }
6232 /* Escaped strings will always be longer than the resulting
6233 Unicode string, so we start with size here and then reduce the
6234 length after conversion to the true value.
6235 (but if the error callback returns a long replacement string
6236 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006237 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006238 writer.min_length = size;
6239 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6240 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006241 }
6242
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 end = s + size;
6244 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 unsigned char c = (unsigned char) *s++;
6246 Py_UCS4 ch;
6247 int count;
6248 Py_ssize_t startinpos;
6249 Py_ssize_t endinpos;
6250 const char *message;
6251
6252#define WRITE_ASCII_CHAR(ch) \
6253 do { \
6254 assert(ch <= 127); \
6255 assert(writer.pos < writer.size); \
6256 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6257 } while(0)
6258
6259#define WRITE_CHAR(ch) \
6260 do { \
6261 if (ch <= writer.maxchar) { \
6262 assert(writer.pos < writer.size); \
6263 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6264 } \
6265 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6266 goto onError; \
6267 } \
6268 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269
6270 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006271 if (c != '\\') {
6272 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 continue;
6274 }
6275
Victor Stinner62ec3312016-09-06 17:04:34 -07006276 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006278 if (s >= end) {
6279 message = "\\ at end of string";
6280 goto error;
6281 }
6282 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006283
Victor Stinner62ec3312016-09-06 17:04:34 -07006284 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006285 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006288 case '\n': continue;
6289 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6290 case '\'': WRITE_ASCII_CHAR('\''); continue;
6291 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6292 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006293 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006294 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6295 case 't': WRITE_ASCII_CHAR('\t'); continue;
6296 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6297 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006298 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006299 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006300 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 case '0': case '1': case '2': case '3':
6305 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006306 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006307 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006308 ch = (ch<<3) + *s++ - '0';
6309 if (s < end && '0' <= *s && *s <= '7') {
6310 ch = (ch<<3) + *s++ - '0';
6311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006313 WRITE_CHAR(ch);
6314 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 /* hex escapes */
6317 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006320 message = "truncated \\xXX escape";
6321 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006325 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006326 message = "truncated \\uXXXX escape";
6327 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006330 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006331 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006332 message = "truncated \\UXXXXXXXX escape";
6333 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006334 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006335 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 ch <<= 4;
6337 if (c >= '0' && c <= '9') {
6338 ch += c - '0';
6339 }
6340 else if (c >= 'a' && c <= 'f') {
6341 ch += c - ('a' - 10);
6342 }
6343 else if (c >= 'A' && c <= 'F') {
6344 ch += c - ('A' - 10);
6345 }
6346 else {
6347 break;
6348 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006349 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006350 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006351 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006352 }
6353
6354 /* when we get here, ch is a 32-bit unicode character */
6355 if (ch > MAX_UNICODE) {
6356 message = "illegal Unicode character";
6357 goto error;
6358 }
6359
6360 WRITE_CHAR(ch);
6361 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006362
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006364 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006365 if (ucnhash_CAPI == NULL) {
6366 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006367 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6368 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006369 if (ucnhash_CAPI == NULL) {
6370 PyErr_SetString(
6371 PyExc_UnicodeError,
6372 "\\N escapes not supported (can't load unicodedata module)"
6373 );
6374 goto onError;
6375 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006376 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006377
6378 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006379 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006380 const char *start = ++s;
6381 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006382 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006383 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006384 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006385 namelen = s - start;
6386 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006387 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006388 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006389 ch = 0xffffffff; /* in case 'getcode' messes up */
6390 if (namelen <= INT_MAX &&
6391 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6392 &ch, 0)) {
6393 assert(ch <= MAX_UNICODE);
6394 WRITE_CHAR(ch);
6395 continue;
6396 }
6397 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006398 }
6399 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006400 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006401
6402 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006403 if (*first_invalid_escape == NULL) {
6404 *first_invalid_escape = s-1; /* Back up one char, since we've
6405 already incremented s. */
6406 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006407 WRITE_ASCII_CHAR('\\');
6408 WRITE_CHAR(c);
6409 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006411
6412 error:
6413 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006414 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006415 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006416 errors, &errorHandler,
6417 "unicodeescape", message,
6418 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006420 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006421 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006422 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006423
6424#undef WRITE_ASCII_CHAR
6425#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006427
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006428 Py_XDECREF(errorHandler);
6429 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006430 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006431
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006433 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434 Py_XDECREF(errorHandler);
6435 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 return NULL;
6437}
6438
Eric V. Smith42454af2016-10-31 09:22:08 -04006439PyObject *
6440PyUnicode_DecodeUnicodeEscape(const char *s,
6441 Py_ssize_t size,
6442 const char *errors)
6443{
6444 const char *first_invalid_escape;
6445 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6446 &first_invalid_escape);
6447 if (result == NULL)
6448 return NULL;
6449 if (first_invalid_escape != NULL) {
6450 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6451 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006452 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006453 Py_DECREF(result);
6454 return NULL;
6455 }
6456 }
6457 return result;
6458}
6459
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006460/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461
Alexander Belopolsky40018472011-02-26 01:02:56 +00006462PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006463PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006465 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006466 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006468 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006469 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006470 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
Ezio Melottie7f90372012-10-05 03:33:31 +03006472 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006473 escape.
6474
Ezio Melottie7f90372012-10-05 03:33:31 +03006475 For UCS1 strings it's '\xxx', 4 bytes per source character.
6476 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6477 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006478 */
6479
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006480 if (!PyUnicode_Check(unicode)) {
6481 PyErr_BadArgument();
6482 return NULL;
6483 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006484 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006485 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006486 }
Victor Stinner358af132015-10-12 22:36:57 +02006487
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006488 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006489 if (len == 0) {
6490 return PyBytes_FromStringAndSize(NULL, 0);
6491 }
6492
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006493 kind = PyUnicode_KIND(unicode);
6494 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006495 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6496 bytes, and 1 byte characters 4. */
6497 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006498 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 return PyErr_NoMemory();
6500 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006501 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006502 if (repr == NULL) {
6503 return NULL;
6504 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006505
Victor Stinner62ec3312016-09-06 17:04:34 -07006506 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006507 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006508 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006509
Victor Stinner62ec3312016-09-06 17:04:34 -07006510 /* U+0000-U+00ff range */
6511 if (ch < 0x100) {
6512 if (ch >= ' ' && ch < 127) {
6513 if (ch != '\\') {
6514 /* Copy printable US ASCII as-is */
6515 *p++ = (char) ch;
6516 }
6517 /* Escape backslashes */
6518 else {
6519 *p++ = '\\';
6520 *p++ = '\\';
6521 }
6522 }
Victor Stinner358af132015-10-12 22:36:57 +02006523
Victor Stinner62ec3312016-09-06 17:04:34 -07006524 /* Map special whitespace to '\t', \n', '\r' */
6525 else if (ch == '\t') {
6526 *p++ = '\\';
6527 *p++ = 't';
6528 }
6529 else if (ch == '\n') {
6530 *p++ = '\\';
6531 *p++ = 'n';
6532 }
6533 else if (ch == '\r') {
6534 *p++ = '\\';
6535 *p++ = 'r';
6536 }
6537
6538 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6539 else {
6540 *p++ = '\\';
6541 *p++ = 'x';
6542 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6543 *p++ = Py_hexdigits[ch & 0x000F];
6544 }
Tim Petersced69f82003-09-16 20:30:58 +00006545 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006546 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006547 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 *p++ = '\\';
6549 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006550 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6551 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6552 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6553 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006555 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6556 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006557
Victor Stinner62ec3312016-09-06 17:04:34 -07006558 /* Make sure that the first two digits are zero */
6559 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006560 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006561 *p++ = 'U';
6562 *p++ = '0';
6563 *p++ = '0';
6564 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6565 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6566 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6567 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6568 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6569 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
Victor Stinner62ec3312016-09-06 17:04:34 -07006573 assert(p - PyBytes_AS_STRING(repr) > 0);
6574 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6575 return NULL;
6576 }
6577 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578}
6579
Alexander Belopolsky40018472011-02-26 01:02:56 +00006580PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006581PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6582 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006584 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006585 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006586 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006588 }
6589
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006590 result = PyUnicode_AsUnicodeEscapeString(tmp);
6591 Py_DECREF(tmp);
6592 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593}
6594
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6596
Alexander Belopolsky40018472011-02-26 01:02:56 +00006597PyObject *
6598PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006599 Py_ssize_t size,
6600 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006602 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006603 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006605 PyObject *errorHandler = NULL;
6606 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006607
Victor Stinner62ec3312016-09-06 17:04:34 -07006608 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006609 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006610 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006611
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 /* Escaped strings will always be longer than the resulting
6613 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614 length after conversion to the true value. (But decoding error
6615 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006616 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006617 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006618 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6619 goto onError;
6620 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006621
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 end = s + size;
6623 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006624 unsigned char c = (unsigned char) *s++;
6625 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006626 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006627 Py_ssize_t startinpos;
6628 Py_ssize_t endinpos;
6629 const char *message;
6630
6631#define WRITE_CHAR(ch) \
6632 do { \
6633 if (ch <= writer.maxchar) { \
6634 assert(writer.pos < writer.size); \
6635 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6636 } \
6637 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6638 goto onError; \
6639 } \
6640 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006643 if (c != '\\' || s >= end) {
6644 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006646 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006647
Victor Stinner62ec3312016-09-06 17:04:34 -07006648 c = (unsigned char) *s++;
6649 if (c == 'u') {
6650 count = 4;
6651 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006652 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006653 else if (c == 'U') {
6654 count = 8;
6655 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006656 }
6657 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006658 assert(writer.pos < writer.size);
6659 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6660 WRITE_CHAR(c);
6661 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006662 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006663 startinpos = s - starts - 2;
6664
6665 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6666 for (ch = 0; count && s < end; ++s, --count) {
6667 c = (unsigned char)*s;
6668 ch <<= 4;
6669 if (c >= '0' && c <= '9') {
6670 ch += c - '0';
6671 }
6672 else if (c >= 'a' && c <= 'f') {
6673 ch += c - ('a' - 10);
6674 }
6675 else if (c >= 'A' && c <= 'F') {
6676 ch += c - ('A' - 10);
6677 }
6678 else {
6679 break;
6680 }
6681 }
6682 if (!count) {
6683 if (ch <= MAX_UNICODE) {
6684 WRITE_CHAR(ch);
6685 continue;
6686 }
6687 message = "\\Uxxxxxxxx out of range";
6688 }
6689
6690 endinpos = s-starts;
6691 writer.min_length = end - s + writer.pos;
6692 if (unicode_decode_call_errorhandler_writer(
6693 errors, &errorHandler,
6694 "rawunicodeescape", message,
6695 &starts, &end, &startinpos, &endinpos, &exc, &s,
6696 &writer)) {
6697 goto onError;
6698 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006699 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006700
6701#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703 Py_XDECREF(errorHandler);
6704 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006705 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006706
Benjamin Peterson29060642009-01-31 22:14:21 +00006707 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006708 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 Py_XDECREF(errorHandler);
6710 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006712
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713}
6714
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006715
Alexander Belopolsky40018472011-02-26 01:02:56 +00006716PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006717PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718{
Victor Stinner62ec3312016-09-06 17:04:34 -07006719 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006721 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006722 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006723 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006724 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006726 if (!PyUnicode_Check(unicode)) {
6727 PyErr_BadArgument();
6728 return NULL;
6729 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006730 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006731 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006732 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006733 kind = PyUnicode_KIND(unicode);
6734 data = PyUnicode_DATA(unicode);
6735 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006736 if (kind == PyUnicode_1BYTE_KIND) {
6737 return PyBytes_FromStringAndSize(data, len);
6738 }
Victor Stinner0e368262011-11-10 20:12:49 +01006739
Victor Stinner62ec3312016-09-06 17:04:34 -07006740 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6741 bytes, and 1 byte characters 4. */
6742 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006743
Victor Stinner62ec3312016-09-06 17:04:34 -07006744 if (len > PY_SSIZE_T_MAX / expandsize) {
6745 return PyErr_NoMemory();
6746 }
6747 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6748 if (repr == NULL) {
6749 return NULL;
6750 }
6751 if (len == 0) {
6752 return repr;
6753 }
6754
6755 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006756 for (pos = 0; pos < len; pos++) {
6757 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006758
Victor Stinner62ec3312016-09-06 17:04:34 -07006759 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6760 if (ch < 0x100) {
6761 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006762 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006763 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006764 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 *p++ = '\\';
6766 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006767 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6768 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6769 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6770 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006772 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6773 else {
6774 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6775 *p++ = '\\';
6776 *p++ = 'U';
6777 *p++ = '0';
6778 *p++ = '0';
6779 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6780 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6781 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6782 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6783 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6784 *p++ = Py_hexdigits[ch & 15];
6785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006787
Victor Stinner62ec3312016-09-06 17:04:34 -07006788 assert(p > PyBytes_AS_STRING(repr));
6789 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6790 return NULL;
6791 }
6792 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793}
6794
Alexander Belopolsky40018472011-02-26 01:02:56 +00006795PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006796PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6797 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006799 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006800 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006801 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006802 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006803 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6804 Py_DECREF(tmp);
6805 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806}
6807
/* --- Latin-1 Codec ------------------------------------------------------ */
6809
Alexander Belopolsky40018472011-02-26 01:02:56 +00006810PyObject *
6811PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006812 Py_ssize_t size,
6813 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006816 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817}
6818
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006819/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006820static void
6821make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006822 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006823 PyObject *unicode,
6824 Py_ssize_t startpos, Py_ssize_t endpos,
6825 const char *reason)
6826{
6827 if (*exceptionObject == NULL) {
6828 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006829 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006830 encoding, unicode, startpos, endpos, reason);
6831 }
6832 else {
6833 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6834 goto onError;
6835 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6836 goto onError;
6837 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6838 goto onError;
6839 return;
6840 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006841 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006842 }
6843}
6844
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006845/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006846static void
6847raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006848 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006849 PyObject *unicode,
6850 Py_ssize_t startpos, Py_ssize_t endpos,
6851 const char *reason)
6852{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006853 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006854 encoding, unicode, startpos, endpos, reason);
6855 if (*exceptionObject != NULL)
6856 PyCodec_StrictErrors(*exceptionObject);
6857}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006858
6859/* error handling callback helper:
6860 build arguments, call the callback and check the arguments,
6861 put the result into newpos and return the replacement string, which
6862 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006863static PyObject *
6864unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006865 PyObject **errorHandler,
6866 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006867 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006868 Py_ssize_t startpos, Py_ssize_t endpos,
6869 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006870{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006871 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006872 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006873 PyObject *restuple;
6874 PyObject *resunicode;
6875
6876 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006878 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006880 }
6881
Benjamin Petersonbac79492012-01-14 13:34:47 -05006882 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006883 return NULL;
6884 len = PyUnicode_GET_LENGTH(unicode);
6885
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006886 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006887 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006888 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006890
Petr Viktorinffd97532020-02-11 17:46:57 +01006891 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006892 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006894 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006895 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 Py_DECREF(restuple);
6897 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006899 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 &resunicode, newpos)) {
6901 Py_DECREF(restuple);
6902 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006903 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006904 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6905 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6906 Py_DECREF(restuple);
6907 return NULL;
6908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006909 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006910 *newpos = len + *newpos;
6911 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006912 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006913 Py_DECREF(restuple);
6914 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006915 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006916 Py_INCREF(resunicode);
6917 Py_DECREF(restuple);
6918 return resunicode;
6919}
6920
Alexander Belopolsky40018472011-02-26 01:02:56 +00006921static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006922unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006923 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006924 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006925{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006926 /* input state */
6927 Py_ssize_t pos=0, size;
6928 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006929 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 /* pointer into the output */
6931 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006932 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6933 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006934 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006935 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006936 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006937 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006938 /* output object */
6939 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006940
Benjamin Petersonbac79492012-01-14 13:34:47 -05006941 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006942 return NULL;
6943 size = PyUnicode_GET_LENGTH(unicode);
6944 kind = PyUnicode_KIND(unicode);
6945 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946 /* allocate enough for a simple encoding without
6947 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006948 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006949 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006950
6951 _PyBytesWriter_Init(&writer);
6952 str = _PyBytesWriter_Alloc(&writer, size);
6953 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006954 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006955
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006956 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006957 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006958
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006960 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006961 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006962 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006963 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006964 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006966 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006968 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006969 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006971
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006972 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006974
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006975 /* Only overallocate the buffer if it's not the last write */
6976 writer.overallocate = (collend < size);
6977
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006979 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006980 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006981
6982 switch (error_handler) {
6983 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006984 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006986
6987 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006988 memset(str, '?', collend - collstart);
6989 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006990 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006991 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006992 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 break;
Victor Stinner50149202015-09-22 00:26:54 +02006994
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006995 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006996 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006997 writer.min_size -= (collend - collstart);
6998 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006999 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007000 if (str == NULL)
7001 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007002 pos = collend;
7003 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007004
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007005 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007006 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007007 writer.min_size -= (collend - collstart);
7008 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007009 unicode, collstart, collend);
7010 if (str == NULL)
7011 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007012 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 break;
Victor Stinner50149202015-09-22 00:26:54 +02007014
Victor Stinnerc3713e92015-09-29 12:32:13 +02007015 case _Py_ERROR_SURROGATEESCAPE:
7016 for (i = collstart; i < collend; ++i) {
7017 ch = PyUnicode_READ(kind, data, i);
7018 if (ch < 0xdc80 || 0xdcff < ch) {
7019 /* Not a UTF-8b surrogate */
7020 break;
7021 }
7022 *str++ = (char)(ch - 0xdc00);
7023 ++pos;
7024 }
7025 if (i >= collend)
7026 break;
7027 collstart = pos;
7028 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007029 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007030
Benjamin Peterson29060642009-01-31 22:14:21 +00007031 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007032 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7033 encoding, reason, unicode, &exc,
7034 collstart, collend, &newpos);
7035 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007037
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007038 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007039 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007040
Victor Stinner6bd525b2015-10-09 13:10:05 +02007041 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007042 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007043 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007044 PyBytes_AS_STRING(rep),
7045 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007046 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007047 else {
7048 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007049
Victor Stinner6bd525b2015-10-09 13:10:05 +02007050 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007052
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007053 if (limit == 256 ?
7054 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7055 !PyUnicode_IS_ASCII(rep))
7056 {
7057 /* Not all characters are smaller than limit */
7058 raise_encode_exception(&exc, encoding, unicode,
7059 collstart, collend, reason);
7060 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007061 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007062 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7063 str = _PyBytesWriter_WriteBytes(&writer, str,
7064 PyUnicode_DATA(rep),
7065 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007067 if (str == NULL)
7068 goto onError;
7069
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007070 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007071 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007072 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007073
7074 /* If overallocation was disabled, ensure that it was the last
7075 write. Otherwise, we missed an optimization */
7076 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007077 }
7078 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007079
Victor Stinner50149202015-09-22 00:26:54 +02007080 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007081 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007082 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007083
7084 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007085 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007086 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007087 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007088 Py_XDECREF(exc);
7089 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007090}
7091
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007092/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007093PyObject *
7094PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007095 Py_ssize_t size,
7096 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007098 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007099 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007100 if (unicode == NULL)
7101 return NULL;
7102 result = unicode_encode_ucs1(unicode, errors, 256);
7103 Py_DECREF(unicode);
7104 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105}
7106
Alexander Belopolsky40018472011-02-26 01:02:56 +00007107PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007108_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109{
7110 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007111 PyErr_BadArgument();
7112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007114 if (PyUnicode_READY(unicode) == -1)
7115 return NULL;
7116 /* Fast path: if it is a one-byte string, construct
7117 bytes object directly. */
7118 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7119 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7120 PyUnicode_GET_LENGTH(unicode));
7121 /* Non-Latin-1 characters present. Defer to above function to
7122 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007123 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007124}
7125
7126PyObject*
7127PyUnicode_AsLatin1String(PyObject *unicode)
7128{
7129 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130}
7131
7132/* --- 7-bit ASCII Codec -------------------------------------------------- */
7133
Alexander Belopolsky40018472011-02-26 01:02:56 +00007134PyObject *
7135PyUnicode_DecodeASCII(const char *s,
7136 Py_ssize_t size,
7137 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007139 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007140 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007141 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007142 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007143 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007144
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007146 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007147
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007149 if (size == 1 && (unsigned char)s[0] < 128)
7150 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007151
Inada Naoki770847a2019-06-24 12:30:24 +09007152 // Shortcut for simple case
7153 PyObject *u = PyUnicode_New(size, 127);
7154 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007155 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007156 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007157 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007158 if (outpos == size) {
7159 return u;
7160 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007161
Inada Naoki770847a2019-06-24 12:30:24 +09007162 _PyUnicodeWriter writer;
7163 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007164 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007165
Inada Naoki770847a2019-06-24 12:30:24 +09007166 s += outpos;
7167 int kind = writer.kind;
7168 void *data = writer.data;
7169 Py_ssize_t startinpos, endinpos;
7170
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007171 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007172 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007173 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007174 PyUnicode_WRITE(kind, data, writer.pos, c);
7175 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007177 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007179
7180 /* byte outsize range 0x00..0x7f: call the error handler */
7181
7182 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007183 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007184
7185 switch (error_handler)
7186 {
7187 case _Py_ERROR_REPLACE:
7188 case _Py_ERROR_SURROGATEESCAPE:
7189 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007190 but we may switch to UCS2 at the first write */
7191 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7192 goto onError;
7193 kind = writer.kind;
7194 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007195
7196 if (error_handler == _Py_ERROR_REPLACE)
7197 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7198 else
7199 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7200 writer.pos++;
7201 ++s;
7202 break;
7203
7204 case _Py_ERROR_IGNORE:
7205 ++s;
7206 break;
7207
7208 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 startinpos = s-starts;
7210 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007211 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007212 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 "ascii", "ordinal not in range(128)",
7214 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007215 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007217 kind = writer.kind;
7218 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007221 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007222 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007223 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007224
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007226 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007227 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007228 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 return NULL;
7230}
7231
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007232/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007233PyObject *
7234PyUnicode_EncodeASCII(const Py_UNICODE *p,
7235 Py_ssize_t size,
7236 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007238 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007239 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007240 if (unicode == NULL)
7241 return NULL;
7242 result = unicode_encode_ucs1(unicode, errors, 128);
7243 Py_DECREF(unicode);
7244 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245}
7246
Alexander Belopolsky40018472011-02-26 01:02:56 +00007247PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007248_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249{
7250 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007251 PyErr_BadArgument();
7252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007254 if (PyUnicode_READY(unicode) == -1)
7255 return NULL;
7256 /* Fast path: if it is an ASCII-only string, construct bytes object
7257 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007258 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007259 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7260 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007261 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007262}
7263
7264PyObject *
7265PyUnicode_AsASCIIString(PyObject *unicode)
7266{
7267 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268}
7269
Steve Dowercc16be82016-09-08 10:35:16 -07007270#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007271
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007272/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007273
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007274#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007275#define NEED_RETRY
7276#endif
7277
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7282#define DECODING_CHUNK_SIZE (INT_MAX/4)
7283
Victor Stinner3a50e702011-10-18 21:21:00 +02007284#ifndef WC_ERR_INVALID_CHARS
7285# define WC_ERR_INVALID_CHARS 0x0080
7286#endif
7287
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007288static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007289code_page_name(UINT code_page, PyObject **obj)
7290{
7291 *obj = NULL;
7292 if (code_page == CP_ACP)
7293 return "mbcs";
7294 if (code_page == CP_UTF7)
7295 return "CP_UTF7";
7296 if (code_page == CP_UTF8)
7297 return "CP_UTF8";
7298
7299 *obj = PyBytes_FromFormat("cp%u", code_page);
7300 if (*obj == NULL)
7301 return NULL;
7302 return PyBytes_AS_STRING(*obj);
7303}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007304
Victor Stinner3a50e702011-10-18 21:21:00 +02007305static DWORD
7306decode_code_page_flags(UINT code_page)
7307{
7308 if (code_page == CP_UTF7) {
7309 /* The CP_UTF7 decoder only supports flags=0 */
7310 return 0;
7311 }
7312 else
7313 return MB_ERR_INVALID_CHARS;
7314}
7315
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007316/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007317 * Decode a byte string from a Windows code page into unicode object in strict
7318 * mode.
7319 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007320 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7321 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007322 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007323static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007324decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007325 wchar_t **buf,
7326 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007327 const char *in,
7328 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007329{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007330 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007331 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007333
7334 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007335 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007336 while ((outsize = MultiByteToWideChar(code_page, flags,
7337 in, insize, NULL, 0)) <= 0)
7338 {
7339 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7340 goto error;
7341 }
7342 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7343 flags = 0;
7344 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007345
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007346 /* Extend a wchar_t* buffer */
7347 Py_ssize_t n = *bufsize; /* Get the current length */
7348 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7349 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007350 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007351 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007352
7353 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007354 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7355 if (outsize <= 0)
7356 goto error;
7357 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007358
Victor Stinner3a50e702011-10-18 21:21:00 +02007359error:
7360 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7361 return -2;
7362 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007363 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007364}
7365
Victor Stinner3a50e702011-10-18 21:21:00 +02007366/*
7367 * Decode a byte string from a code page into unicode object with an error
7368 * handler.
7369 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007370 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007371 * UnicodeDecodeError exception and returns -1 on error.
7372 */
7373static int
7374decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007375 wchar_t **buf,
7376 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007377 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007378 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007379{
7380 const char *startin = in;
7381 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007382 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007383 /* Ideally, we should get reason from FormatMessage. This is the Windows
7384 2000 English version of the message. */
7385 const char *reason = "No mapping for the Unicode character exists "
7386 "in the target code page.";
7387 /* each step cannot decode more than 1 character, but a character can be
7388 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007389 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007390 int insize;
7391 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 PyObject *errorHandler = NULL;
7393 PyObject *exc = NULL;
7394 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007395 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 DWORD err;
7397 int ret = -1;
7398
7399 assert(size > 0);
7400
7401 encoding = code_page_name(code_page, &encoding_obj);
7402 if (encoding == NULL)
7403 return -1;
7404
Victor Stinner7d00cc12014-03-17 23:08:06 +01007405 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7407 UnicodeDecodeError. */
7408 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7409 if (exc != NULL) {
7410 PyCodec_StrictErrors(exc);
7411 Py_CLEAR(exc);
7412 }
7413 goto error;
7414 }
7415
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007416 /* Extend a wchar_t* buffer */
7417 Py_ssize_t n = *bufsize; /* Get the current length */
7418 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7419 PyErr_NoMemory();
7420 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007422 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7423 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007425 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007426
7427 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007428 while (in < endin)
7429 {
7430 /* Decode a character */
7431 insize = 1;
7432 do
7433 {
7434 outsize = MultiByteToWideChar(code_page, flags,
7435 in, insize,
7436 buffer, Py_ARRAY_LENGTH(buffer));
7437 if (outsize > 0)
7438 break;
7439 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007440 if (err == ERROR_INVALID_FLAGS && flags) {
7441 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7442 flags = 0;
7443 continue;
7444 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 if (err != ERROR_NO_UNICODE_TRANSLATION
7446 && err != ERROR_INSUFFICIENT_BUFFER)
7447 {
7448 PyErr_SetFromWindowsErr(0);
7449 goto error;
7450 }
7451 insize++;
7452 }
7453 /* 4=maximum length of a UTF-8 sequence */
7454 while (insize <= 4 && (in + insize) <= endin);
7455
7456 if (outsize <= 0) {
7457 Py_ssize_t startinpos, endinpos, outpos;
7458
Victor Stinner7d00cc12014-03-17 23:08:06 +01007459 /* last character in partial decode? */
7460 if (in + insize >= endin && !final)
7461 break;
7462
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 startinpos = in - startin;
7464 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007465 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007466 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 errors, &errorHandler,
7468 encoding, reason,
7469 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007470 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 {
7472 goto error;
7473 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007474 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 }
7476 else {
7477 in += insize;
7478 memcpy(out, buffer, outsize * sizeof(wchar_t));
7479 out += outsize;
7480 }
7481 }
7482
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007483 /* Shrink the buffer */
7484 assert(out - *buf <= *bufsize);
7485 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007486 /* (in - startin) <= size and size is an int */
7487 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007488
7489error:
7490 Py_XDECREF(encoding_obj);
7491 Py_XDECREF(errorHandler);
7492 Py_XDECREF(exc);
7493 return ret;
7494}
7495
Victor Stinner3a50e702011-10-18 21:21:00 +02007496static PyObject *
7497decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007498 const char *s, Py_ssize_t size,
7499 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007500{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007501 wchar_t *buf = NULL;
7502 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007503 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007504
Victor Stinner3a50e702011-10-18 21:21:00 +02007505 if (code_page < 0) {
7506 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7507 return NULL;
7508 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007509 if (size < 0) {
7510 PyErr_BadInternalCall();
7511 return NULL;
7512 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007513
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007514 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007516
Victor Stinner76a31a62011-11-04 00:05:13 +01007517 do
7518 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007519#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007520 if (size > DECODING_CHUNK_SIZE) {
7521 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007522 final = 0;
7523 done = 0;
7524 }
7525 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007526#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007527 {
7528 chunk_size = (int)size;
7529 final = (consumed == NULL);
7530 done = 1;
7531 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007532
Victor Stinner76a31a62011-11-04 00:05:13 +01007533 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007534 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007535 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007536 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007537 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007538
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007539 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007540 s, chunk_size);
7541 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007542 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007543 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007544 errors, final);
7545 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007546
7547 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007548 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007549 return NULL;
7550 }
7551
7552 if (consumed)
7553 *consumed += converted;
7554
7555 s += converted;
7556 size -= converted;
7557 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007558
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007559 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7560 PyMem_Free(buf);
7561 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007562}
7563
Alexander Belopolsky40018472011-02-26 01:02:56 +00007564PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007565PyUnicode_DecodeCodePageStateful(int code_page,
7566 const char *s,
7567 Py_ssize_t size,
7568 const char *errors,
7569 Py_ssize_t *consumed)
7570{
7571 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7572}
7573
7574PyObject *
7575PyUnicode_DecodeMBCSStateful(const char *s,
7576 Py_ssize_t size,
7577 const char *errors,
7578 Py_ssize_t *consumed)
7579{
7580 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7581}
7582
7583PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007584PyUnicode_DecodeMBCS(const char *s,
7585 Py_ssize_t size,
7586 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007587{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007588 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7589}
7590
Victor Stinner3a50e702011-10-18 21:21:00 +02007591static DWORD
7592encode_code_page_flags(UINT code_page, const char *errors)
7593{
7594 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007595 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007596 }
7597 else if (code_page == CP_UTF7) {
7598 /* CP_UTF7 only supports flags=0 */
7599 return 0;
7600 }
7601 else {
7602 if (errors != NULL && strcmp(errors, "replace") == 0)
7603 return 0;
7604 else
7605 return WC_NO_BEST_FIT_CHARS;
7606 }
7607}
7608
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007609/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007610 * Encode a Unicode string to a Windows code page into a byte string in strict
7611 * mode.
7612 *
7613 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007614 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007615 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007616static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007617encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007618 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007619 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007620{
Victor Stinner554f3f02010-06-16 23:33:54 +00007621 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007622 BOOL *pusedDefaultChar = &usedDefaultChar;
7623 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007624 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007625 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 const DWORD flags = encode_code_page_flags(code_page, NULL);
7627 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007628 /* Create a substring so that we can get the UTF-16 representation
7629 of just the slice under consideration. */
7630 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007631
Martin v. Löwis3d325192011-11-04 18:23:06 +01007632 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007633
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007635 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007636 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007637 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007638
Victor Stinner2fc507f2011-11-04 20:06:39 +01007639 substring = PyUnicode_Substring(unicode, offset, offset+len);
7640 if (substring == NULL)
7641 return -1;
7642 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7643 if (p == NULL) {
7644 Py_DECREF(substring);
7645 return -1;
7646 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007647 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007648
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007649 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007650 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007651 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 NULL, 0,
7653 NULL, pusedDefaultChar);
7654 if (outsize <= 0)
7655 goto error;
7656 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007657 if (pusedDefaultChar && *pusedDefaultChar) {
7658 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007659 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007660 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007661
Victor Stinner3a50e702011-10-18 21:21:00 +02007662 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007664 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007665 if (*outbytes == NULL) {
7666 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007668 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007669 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007670 }
7671 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007673 const Py_ssize_t n = PyBytes_Size(*outbytes);
7674 if (outsize > PY_SSIZE_T_MAX - n) {
7675 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007676 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007678 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007679 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7680 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007681 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007682 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007683 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007684 }
7685
7686 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007687 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007688 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007689 out, outsize,
7690 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007691 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007692 if (outsize <= 0)
7693 goto error;
7694 if (pusedDefaultChar && *pusedDefaultChar)
7695 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007696 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007697
Victor Stinner3a50e702011-10-18 21:21:00 +02007698error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007699 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007700 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7701 return -2;
7702 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007703 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007704}
7705
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * The characters unicode[unicode_offset:unicode_offset+insize] are converted
 * one at a time and appended to *outbytes (created when it is NULL).  A
 * character that cannot be converted is passed to the error handler named by
 * errors, and the handler's replacement is written instead.
 *
 * Returns 0 on success, or sets an exception and returns -1 on error.  With
 * errors equal to NULL or "strict", a UnicodeEncodeError is built and given
 * to PyCodec_StrictErrors(), and -1 is returned.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* The "used default char" output parameter is only passed for code pages
       other than CP_UTF8 and CP_UTF7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve sizeof(buffer) output bytes per input character, after
       checking that the multiplication cannot overflow. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Characters from 0x10000 up are passed as a surrogate pair */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Converted without using the default character: copy the bytes
               and move on to the next character. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character could not be encoded: ask the error handler for a
           replacement (rep) and for the position to resume at (newpos). */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* bytes replacement: copied to the output unchanged */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Resize the output by (outsize - 1) bytes.  The resize may
                   move the object, so out is recomputed from its offset. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* Replacement handled as a str: every character must be <= 127
               and is stored as a single byte. */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            const void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                /* Same resize and pointer fix-up as in the bytes case */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    /* NOTE(review): pos already holds newpos here, so the
                       reported range is relative to the resume position,
                       not to the character that failed -- confirm this is
                       intended. */
                    raise_encode_exception(&exc,
                                           encoding, unicode,
                                           pos, pos + 1,
                                           "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Shrink the bytes object to the number of bytes actually written */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

error:
    /* Shared exit for success and failure; ret holds the result */
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7893
Victor Stinner3a50e702011-10-18 21:21:00 +02007894static PyObject *
7895encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007896 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007897 const char *errors)
7898{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007899 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007900 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007901 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007902 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007903
Victor Stinner29dacf22015-01-26 16:41:32 +01007904 if (!PyUnicode_Check(unicode)) {
7905 PyErr_BadArgument();
7906 return NULL;
7907 }
7908
Benjamin Petersonbac79492012-01-14 13:34:47 -05007909 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007910 return NULL;
7911 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007912
Victor Stinner3a50e702011-10-18 21:21:00 +02007913 if (code_page < 0) {
7914 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7915 return NULL;
7916 }
7917
Martin v. Löwis3d325192011-11-04 18:23:06 +01007918 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007919 return PyBytes_FromStringAndSize(NULL, 0);
7920
Victor Stinner7581cef2011-11-03 22:32:33 +01007921 offset = 0;
7922 do
7923 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007924#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007925 if (len > DECODING_CHUNK_SIZE) {
7926 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007927 done = 0;
7928 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007929 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007930#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007931 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007932 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007933 done = 1;
7934 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007935
Victor Stinner76a31a62011-11-04 00:05:13 +01007936 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007937 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007938 errors);
7939 if (ret == -2)
7940 ret = encode_code_page_errors(code_page, &outbytes,
7941 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007942 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007943 if (ret < 0) {
7944 Py_XDECREF(outbytes);
7945 return NULL;
7946 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007947
Victor Stinner7581cef2011-11-03 22:32:33 +01007948 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007949 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007950 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007951
Victor Stinner3a50e702011-10-18 21:21:00 +02007952 return outbytes;
7953}
7954
7955PyObject *
7956PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7957 Py_ssize_t size,
7958 const char *errors)
7959{
Victor Stinner7581cef2011-11-03 22:32:33 +01007960 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007961 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007962 if (unicode == NULL)
7963 return NULL;
7964 res = encode_code_page(CP_ACP, unicode, errors);
7965 Py_DECREF(unicode);
7966 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007967}
7968
7969PyObject *
7970PyUnicode_EncodeCodePage(int code_page,
7971 PyObject *unicode,
7972 const char *errors)
7973{
Victor Stinner7581cef2011-11-03 22:32:33 +01007974 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007975}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007976
Alexander Belopolsky40018472011-02-26 01:02:56 +00007977PyObject *
7978PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007979{
Victor Stinner7581cef2011-11-03 22:32:33 +01007980 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007981}
7982
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007983#undef NEED_RETRY
7984
Steve Dowercc16be82016-09-08 10:35:16 -07007985#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007986
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987/* --- Character Mapping Codec -------------------------------------------- */
7988
/*
 * Decode the size bytes at s using a str object as the mapping table and
 * append the result to writer.
 *
 * Each input byte is used as an index into mapping.  A byte that is beyond
 * the end of mapping, or that maps to U+FFFE, is treated as undefined and
 * is handed to the error handler named by errors.
 *
 * Returns 0 on success and -1 on error.
 */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    const void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        /* Characters are stored directly at writer->pos with no per-character
           size check.  NOTE(review): this relies on the writer already having
           room for size characters -- confirm against the callers. */
        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Raise the writer's maxchar, then reload the cached
                   maxchar and data pointer from the writer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Inner fast loops for a 2-byte mapping with at least 256
               entries.  They write directly into the writer's buffer and
               jump to Error for any character they cannot store as-is. */
            enum PyUnicode_Kind outkind = writer->kind;
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Does not fit the current buffer: let the generic code
                       below handle this character. */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: look up one byte, with a bounds check on mapping */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
        /* Also entered from the fast loops above, with s and x already set
           for the character that stopped them. */
Error:
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is not advanced here; it is passed by address to the error
               handler call above. */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8107
/*
 * Decode the size bytes at s by looking up each byte in mapping with
 * PyObject_GetItem() and append the result to writer.
 *
 * The value found for a byte may be:
 *   - an integer: the code point to write (0xFFFE means undefined; values
 *     outside 0..MAX_UNICODE raise TypeError);
 *   - a str: written as is (a one-character str equal to U+FFFE means
 *     undefined);
 *   - None: undefined.
 * A LookupError raised by the lookup also means undefined.  Undefined bytes
 * are handed to the error handler named by errors.  Any other kind of value
 * raises TypeError.
 *
 * Returns 0 on success and -1 on error.
 */
static int
charmap_decode_mapping(const char *s,
                       Py_ssize_t size,
                       PyObject *mapping,
                       const char *errors,
                       _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    unsigned char ch;
    PyObject *key, *item = NULL;

    e = s + size;

    while (s < e) {
        ch = *s;

        /* Get mapping (char ordinal -> integer, Unicode char or None) */
        key = PyLong_FromLong((long)ch);
        if (key == NULL)
            goto onError;

        item = PyObject_GetItem(mapping, key);
        Py_DECREF(key);
        if (item == NULL) {
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                /* No mapping found means: mapping is undefined. */
                PyErr_Clear();
                goto Undefined;
            } else
                goto onError;
        }

        /* Apply mapping */
        if (item == Py_None)
            goto Undefined;
        if (PyLong_Check(item)) {
            long value = PyLong_AS_LONG(item);
            if (value == 0xFFFE)
                goto Undefined;
            if (value < 0 || value > MAX_UNICODE) {
                PyErr_Format(PyExc_TypeError,
                             "character mapping must be in range(0x%lx)",
                             (unsigned long)MAX_UNICODE + 1);
                goto onError;
            }

            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                goto onError;
        }
        else if (PyUnicode_Check(item)) {
            if (PyUnicode_READY(item) == -1)
                goto onError;
            if (PyUnicode_GET_LENGTH(item) == 1) {
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
                if (value == 0xFFFE)
                    goto Undefined;
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                    goto onError;
            }
            else {
                /* The mapped string is not exactly one character long:
                   turn on the writer's overallocate flag before writing. */
                writer->overallocate = 1;
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
                    goto onError;
            }
        }
        else {
            /* wrong return value */
            PyErr_SetString(PyExc_TypeError,
                            "character mapping must return integer, None or str");
            goto onError;
        }
        /* Release the looked-up value and move to the next input byte */
        Py_CLEAR(item);
        ++s;
        continue;

Undefined:
        /* undefined mapping */
        Py_CLEAR(item);
        startinpos = s-starts;
        endinpos = startinpos+1;
        /* s is passed by address to the error handler call, which is what
           moves the input position on this path. */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "charmap", "character maps to <undefined>",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                writer)) {
            goto onError;
        }
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    /* item may still hold a reference when an error occurs mid-iteration */
    Py_XDECREF(item);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8209
Alexander Belopolsky40018472011-02-26 01:02:56 +00008210PyObject *
8211PyUnicode_DecodeCharmap(const char *s,
8212 Py_ssize_t size,
8213 PyObject *mapping,
8214 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008216 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008217
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218 /* Default to Latin-1 */
8219 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008223 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008224 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008225 writer.min_length = size;
8226 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008228
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008229 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008230 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8231 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008232 }
8233 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008234 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8235 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008237 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008238
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008240 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 return NULL;
8242}
8243
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008244/* Charmap encoding: the lookup table */
8245
Alexander Belopolsky40018472011-02-26 01:02:56 +00008246struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 PyObject_HEAD
8248 unsigned char level1[32];
8249 int count2, count3;
8250 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008251};
8252
8253static PyObject*
8254encoding_map_size(PyObject *obj, PyObject* args)
8255{
8256 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008257 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008259}
8260
8261static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008262 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 PyDoc_STR("Return the size (in bytes) of this object") },
8264 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008265};
8266
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008267static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008268 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 "EncodingMap", /*tp_name*/
8270 sizeof(struct encoding_map), /*tp_basicsize*/
8271 0, /*tp_itemsize*/
8272 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008273 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008274 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 0, /*tp_getattr*/
8276 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008277 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 0, /*tp_repr*/
8279 0, /*tp_as_number*/
8280 0, /*tp_as_sequence*/
8281 0, /*tp_as_mapping*/
8282 0, /*tp_hash*/
8283 0, /*tp_call*/
8284 0, /*tp_str*/
8285 0, /*tp_getattro*/
8286 0, /*tp_setattro*/
8287 0, /*tp_as_buffer*/
8288 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8289 0, /*tp_doc*/
8290 0, /*tp_traverse*/
8291 0, /*tp_clear*/
8292 0, /*tp_richcompare*/
8293 0, /*tp_weaklistoffset*/
8294 0, /*tp_iter*/
8295 0, /*tp_iternext*/
8296 encoding_map_methods, /*tp_methods*/
8297 0, /*tp_members*/
8298 0, /*tp_getset*/
8299 0, /*tp_base*/
8300 0, /*tp_dict*/
8301 0, /*tp_descr_get*/
8302 0, /*tp_descr_set*/
8303 0, /*tp_dictoffset*/
8304 0, /*tp_init*/
8305 0, /*tp_alloc*/
8306 0, /*tp_new*/
8307 0, /*tp_free*/
8308 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309};
8310
8311PyObject*
8312PyUnicode_BuildEncodingMap(PyObject* string)
8313{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008314 PyObject *result;
8315 struct encoding_map *mresult;
8316 int i;
8317 int need_dict = 0;
8318 unsigned char level1[32];
8319 unsigned char level2[512];
8320 unsigned char *mlevel1, *mlevel2, *mlevel3;
8321 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008322 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008323 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008324 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008325 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008326
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008327 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008328 PyErr_BadArgument();
8329 return NULL;
8330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008331 kind = PyUnicode_KIND(string);
8332 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008333 length = PyUnicode_GET_LENGTH(string);
8334 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335 memset(level1, 0xFF, sizeof level1);
8336 memset(level2, 0xFF, sizeof level2);
8337
8338 /* If there isn't a one-to-one mapping of NULL to \0,
8339 or if there are non-BMP characters, we need to use
8340 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008341 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008343 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008344 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008345 ch = PyUnicode_READ(kind, data, i);
8346 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008347 need_dict = 1;
8348 break;
8349 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008350 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008351 /* unmapped character */
8352 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 l1 = ch >> 11;
8354 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008355 if (level1[l1] == 0xFF)
8356 level1[l1] = count2++;
8357 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008358 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008359 }
8360
8361 if (count2 >= 0xFF || count3 >= 0xFF)
8362 need_dict = 1;
8363
8364 if (need_dict) {
8365 PyObject *result = PyDict_New();
8366 PyObject *key, *value;
8367 if (!result)
8368 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008369 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008371 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008372 if (!key || !value)
8373 goto failed1;
8374 if (PyDict_SetItem(result, key, value) == -1)
8375 goto failed1;
8376 Py_DECREF(key);
8377 Py_DECREF(value);
8378 }
8379 return result;
8380 failed1:
8381 Py_XDECREF(key);
8382 Py_XDECREF(value);
8383 Py_DECREF(result);
8384 return NULL;
8385 }
8386
8387 /* Create a three-level trie */
8388 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8389 16*count2 + 128*count3 - 1);
8390 if (!result)
8391 return PyErr_NoMemory();
8392 PyObject_Init(result, &EncodingMapType);
8393 mresult = (struct encoding_map*)result;
8394 mresult->count2 = count2;
8395 mresult->count3 = count3;
8396 mlevel1 = mresult->level1;
8397 mlevel2 = mresult->level23;
8398 mlevel3 = mresult->level23 + 16*count2;
8399 memcpy(mlevel1, level1, 32);
8400 memset(mlevel2, 0xFF, 16*count2);
8401 memset(mlevel3, 0, 128*count3);
8402 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008403 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008404 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008405 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8406 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008407 /* unmapped character */
8408 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008409 o1 = ch>>11;
8410 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008411 i2 = 16*mlevel1[o1] + o2;
8412 if (mlevel2[i2] == 0xFF)
8413 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008414 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008415 i3 = 128*mlevel2[i2] + o3;
8416 mlevel3[i3] = i;
8417 }
8418 return result;
8419}
8420
8421static int
Victor Stinner22168992011-11-20 17:09:18 +01008422encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008423{
8424 struct encoding_map *map = (struct encoding_map*)mapping;
8425 int l1 = c>>11;
8426 int l2 = (c>>7) & 0xF;
8427 int l3 = c & 0x7F;
8428 int i;
8429
Victor Stinner22168992011-11-20 17:09:18 +01008430 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008432 if (c == 0)
8433 return 0;
8434 /* level 1*/
8435 i = map->level1[l1];
8436 if (i == 0xFF) {
8437 return -1;
8438 }
8439 /* level 2*/
8440 i = map->level23[16*i+l2];
8441 if (i == 0xFF) {
8442 return -1;
8443 }
8444 /* level 3 */
8445 i = map->level23[16*map->count2 + 128*i + l3];
8446 if (i == 0) {
8447 return -1;
8448 }
8449 return i;
8450}
8451
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008452/* Lookup the character ch in the mapping. If the character
8453 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008454 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008455static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008456charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457{
Christian Heimes217cfd12007-12-02 14:31:20 +00008458 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459 PyObject *x;
8460
8461 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008463 x = PyObject_GetItem(mapping, w);
8464 Py_DECREF(w);
8465 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8467 /* No mapping found means: mapping is undefined. */
8468 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008469 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 } else
8471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008473 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008475 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 long value = PyLong_AS_LONG(x);
8477 if (value < 0 || value > 255) {
8478 PyErr_SetString(PyExc_TypeError,
8479 "character mapping must be in range(256)");
8480 Py_DECREF(x);
8481 return NULL;
8482 }
8483 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008485 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 /* wrong return value */
8489 PyErr_Format(PyExc_TypeError,
8490 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008491 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 Py_DECREF(x);
8493 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494 }
8495}
8496
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008497static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008498charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008499{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008500 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8501 /* exponentially overallocate to minimize reallocations */
8502 if (requiredsize < 2*outsize)
8503 requiredsize = 2*outsize;
8504 if (_PyBytes_Resize(outobj, requiredsize))
8505 return -1;
8506 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008507}
8508
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the replacement was written to the output */
    enc_FAILED = 1,     /* the character has no mapping; nothing written */
    enc_EXCEPTION = 2   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008512/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008513 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008514 space is available. Return a new reference to the object that
8515 was put in the output buffer, or Py_None, if the mapping was undefined
8516 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008517 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008518static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008519charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008520 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008522 PyObject *rep;
8523 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008524 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525
Andy Lesterdffe4c02020-03-04 07:15:20 -06008526 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008527 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008529 if (res == -1)
8530 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 if (outsize<requiredsize)
8532 if (charmapencode_resize(outobj, outpos, requiredsize))
8533 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008534 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 outstart[(*outpos)++] = (char)res;
8536 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008537 }
8538
8539 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008542 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 Py_DECREF(rep);
8544 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008545 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 if (PyLong_Check(rep)) {
8547 Py_ssize_t requiredsize = *outpos+1;
8548 if (outsize<requiredsize)
8549 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8550 Py_DECREF(rep);
8551 return enc_EXCEPTION;
8552 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008553 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008555 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 else {
8557 const char *repchars = PyBytes_AS_STRING(rep);
8558 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8559 Py_ssize_t requiredsize = *outpos+repsize;
8560 if (outsize<requiredsize)
8561 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8562 Py_DECREF(rep);
8563 return enc_EXCEPTION;
8564 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008565 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 memcpy(outstart + *outpos, repchars, repsize);
8567 *outpos += repsize;
8568 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008569 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008570 Py_DECREF(rep);
8571 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572}
8573
8574/* handle an error in PyUnicode_EncodeCharmap
8575 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008576static int
8577charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008578 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008579 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008580 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008581 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582{
8583 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008584 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008585 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008586 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008587 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008588 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008590 Py_ssize_t collstartpos = *inpos;
8591 Py_ssize_t collendpos = *inpos+1;
8592 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008593 const char *encoding = "charmap";
8594 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008595 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008596 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008597 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598
Benjamin Petersonbac79492012-01-14 13:34:47 -05008599 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008600 return -1;
8601 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008602 /* find all unencodable characters */
8603 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008604 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008605 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008606 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008607 val = encoding_map_lookup(ch, mapping);
8608 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 break;
8610 ++collendpos;
8611 continue;
8612 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008613
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008614 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8615 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 if (rep==NULL)
8617 return -1;
8618 else if (rep!=Py_None) {
8619 Py_DECREF(rep);
8620 break;
8621 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008622 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008623 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008624 }
8625 /* cache callback name lookup
8626 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008627 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008628 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008629
8630 switch (*error_handler) {
8631 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008632 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008633 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008634
8635 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008636 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 x = charmapencode_output('?', mapping, res, respos);
8638 if (x==enc_EXCEPTION) {
8639 return -1;
8640 }
8641 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008642 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 return -1;
8644 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008645 }
8646 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008647 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008648 *inpos = collendpos;
8649 break;
Victor Stinner50149202015-09-22 00:26:54 +02008650
8651 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008652 /* generate replacement (temporarily (mis)uses p) */
8653 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 char buffer[2+29+1+1];
8655 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008656 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 for (cp = buffer; *cp; ++cp) {
8658 x = charmapencode_output(*cp, mapping, res, respos);
8659 if (x==enc_EXCEPTION)
8660 return -1;
8661 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008662 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 return -1;
8664 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008665 }
8666 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008667 *inpos = collendpos;
8668 break;
Victor Stinner50149202015-09-22 00:26:54 +02008669
Benjamin Peterson14339b62009-01-31 16:36:08 +00008670 default:
Victor Stinner50149202015-09-22 00:26:54 +02008671 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008672 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008674 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008676 if (PyBytes_Check(repunicode)) {
8677 /* Directly copy bytes result to output. */
8678 Py_ssize_t outsize = PyBytes_Size(*res);
8679 Py_ssize_t requiredsize;
8680 repsize = PyBytes_Size(repunicode);
8681 requiredsize = *respos + repsize;
8682 if (requiredsize > outsize)
8683 /* Make room for all additional bytes. */
8684 if (charmapencode_resize(res, respos, requiredsize)) {
8685 Py_DECREF(repunicode);
8686 return -1;
8687 }
8688 memcpy(PyBytes_AsString(*res) + *respos,
8689 PyBytes_AsString(repunicode), repsize);
8690 *respos += repsize;
8691 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008692 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008693 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008695 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008696 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008697 Py_DECREF(repunicode);
8698 return -1;
8699 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008700 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008701 data = PyUnicode_DATA(repunicode);
8702 kind = PyUnicode_KIND(repunicode);
8703 for (index = 0; index < repsize; index++) {
8704 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8705 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008707 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 return -1;
8709 }
8710 else if (x==enc_FAILED) {
8711 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008712 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 return -1;
8714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008715 }
8716 *inpos = newpos;
8717 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 }
8719 return 0;
8720}
8721
Alexander Belopolsky40018472011-02-26 01:02:56 +00008722PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008723_PyUnicode_EncodeCharmap(PyObject *unicode,
8724 PyObject *mapping,
8725 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 /* output object */
8728 PyObject *res = NULL;
8729 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008730 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008731 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008733 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008734 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008735 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008736 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008737 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008738 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739
Benjamin Petersonbac79492012-01-14 13:34:47 -05008740 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008741 return NULL;
8742 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008743 data = PyUnicode_DATA(unicode);
8744 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008745
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746 /* Default to Latin-1 */
8747 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008748 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008750 /* allocate enough for a simple encoding without
8751 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008752 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008753 if (res == NULL)
8754 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008755 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008758 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008759 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008761 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 if (x==enc_EXCEPTION) /* error */
8763 goto onError;
8764 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008765 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008767 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 &res, &respos)) {
8769 goto onError;
8770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008771 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 else
8773 /* done with this character => adjust input position */
8774 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008777 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008778 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008779 if (_PyBytes_Resize(&res, respos) < 0)
8780 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008781
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008782 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008783 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008784 return res;
8785
Benjamin Peterson29060642009-01-31 22:14:21 +00008786 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008787 Py_XDECREF(res);
8788 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008789 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790 return NULL;
8791}
8792
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008793/* Deprecated */
8794PyObject *
8795PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8796 Py_ssize_t size,
8797 PyObject *mapping,
8798 const char *errors)
8799{
8800 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008801 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008802 if (unicode == NULL)
8803 return NULL;
8804 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8805 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008806 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008807}
8808
Alexander Belopolsky40018472011-02-26 01:02:56 +00008809PyObject *
8810PyUnicode_AsCharmapString(PyObject *unicode,
8811 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008812{
8813 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008814 PyErr_BadArgument();
8815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008817 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818}
8819
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008820/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008821static void
8822make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008824 Py_ssize_t startpos, Py_ssize_t endpos,
8825 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008827 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 *exceptionObject = _PyUnicodeTranslateError_Create(
8829 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008830 }
8831 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8833 goto onError;
8834 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8835 goto onError;
8836 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8837 goto onError;
8838 return;
8839 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008840 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841 }
8842}
8843
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008844/* error handling callback helper:
8845 build arguments, call the callback and check the arguments,
8846 put the result into newpos and return the replacement string, which
8847 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008848static PyObject *
8849unicode_translate_call_errorhandler(const char *errors,
8850 PyObject **errorHandler,
8851 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008853 Py_ssize_t startpos, Py_ssize_t endpos,
8854 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008855{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008856 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008857
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008858 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008859 PyObject *restuple;
8860 PyObject *resunicode;
8861
8862 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008864 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008865 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008866 }
8867
8868 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008869 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008870 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008871 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008872
Petr Viktorinffd97532020-02-11 17:46:57 +01008873 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008874 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008876 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008877 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 Py_DECREF(restuple);
8879 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008880 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008881 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 &resunicode, &i_newpos)) {
8883 Py_DECREF(restuple);
8884 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008885 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008886 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008887 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008888 else
8889 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008890 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008891 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 Py_DECREF(restuple);
8893 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008894 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008895 Py_INCREF(resunicode);
8896 Py_DECREF(restuple);
8897 return resunicode;
8898}
8899
8900/* Lookup the character ch in the mapping and put the result in result,
8901 which must be decrefed by the caller.
8902 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008903static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008905{
Christian Heimes217cfd12007-12-02 14:31:20 +00008906 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008907 PyObject *x;
8908
8909 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008911 x = PyObject_GetItem(mapping, w);
8912 Py_DECREF(w);
8913 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8915 /* No mapping found means: use 1:1 mapping. */
8916 PyErr_Clear();
8917 *result = NULL;
8918 return 0;
8919 } else
8920 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008921 }
8922 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 *result = x;
8924 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008925 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008926 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008928 if (value < 0 || value > MAX_UNICODE) {
8929 PyErr_Format(PyExc_ValueError,
8930 "character mapping must be in range(0x%x)",
8931 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008932 Py_DECREF(x);
8933 return -1;
8934 }
8935 *result = x;
8936 return 0;
8937 }
8938 else if (PyUnicode_Check(x)) {
8939 *result = x;
8940 return 0;
8941 }
8942 else {
8943 /* wrong return value */
8944 PyErr_SetString(PyExc_TypeError,
8945 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008946 Py_DECREF(x);
8947 return -1;
8948 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008949}
Victor Stinner1194ea02014-04-04 19:37:40 +02008950
8951/* lookup the character, write the result into the writer.
8952 Return 1 if the result was written into the writer, return 0 if the mapping
8953 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008954static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008955charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8956 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008957{
Victor Stinner1194ea02014-04-04 19:37:40 +02008958 PyObject *item;
8959
8960 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008962
8963 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008965 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008967 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008968 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008969 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008970
8971 if (item == Py_None) {
8972 Py_DECREF(item);
8973 return 0;
8974 }
8975
8976 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008977 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8978 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8979 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008980 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8981 Py_DECREF(item);
8982 return -1;
8983 }
8984 Py_DECREF(item);
8985 return 1;
8986 }
8987
8988 if (!PyUnicode_Check(item)) {
8989 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008991 }
8992
8993 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8994 Py_DECREF(item);
8995 return -1;
8996 }
8997
8998 Py_DECREF(item);
8999 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009000}
9001
Victor Stinner89a76ab2014-04-05 11:44:04 +02009002static int
9003unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9004 Py_UCS1 *translate)
9005{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009006 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009007 int ret = 0;
9008
Victor Stinner89a76ab2014-04-05 11:44:04 +02009009 if (charmaptranslate_lookup(ch, mapping, &item)) {
9010 return -1;
9011 }
9012
9013 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009014 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009015 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009016 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009017 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009018 /* not found => default to 1:1 mapping */
9019 translate[ch] = ch;
9020 return 1;
9021 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009022 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009023 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009024 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9025 used it */
9026 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009027 /* invalid character or character outside ASCII:
9028 skip the fast translate */
9029 goto exit;
9030 }
9031 translate[ch] = (Py_UCS1)replace;
9032 }
9033 else if (PyUnicode_Check(item)) {
9034 Py_UCS4 replace;
9035
9036 if (PyUnicode_READY(item) == -1) {
9037 Py_DECREF(item);
9038 return -1;
9039 }
9040 if (PyUnicode_GET_LENGTH(item) != 1)
9041 goto exit;
9042
9043 replace = PyUnicode_READ_CHAR(item, 0);
9044 if (replace > 127)
9045 goto exit;
9046 translate[ch] = (Py_UCS1)replace;
9047 }
9048 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009049 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009050 goto exit;
9051 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009052 ret = 1;
9053
Benjamin Peterson1365de72014-04-07 20:15:41 -04009054 exit:
9055 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009056 return ret;
9057}
9058
9059/* Fast path for ascii => ascii translation. Return 1 if the whole string
9060 was translated into writer, return 0 if the input string was partially
9061 translated into writer, raise an exception and return -1 on error. */
9062static int
9063unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009064 _PyUnicodeWriter *writer, int ignore,
9065 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009066{
Victor Stinner872b2912014-04-05 14:27:07 +02009067 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009068 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009069 const Py_UCS1 *in, *end;
9070 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009071 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009072
Victor Stinner89a76ab2014-04-05 11:44:04 +02009073 len = PyUnicode_GET_LENGTH(input);
9074
Victor Stinner872b2912014-04-05 14:27:07 +02009075 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009076
9077 in = PyUnicode_1BYTE_DATA(input);
9078 end = in + len;
9079
9080 assert(PyUnicode_IS_ASCII(writer->buffer));
9081 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9082 out = PyUnicode_1BYTE_DATA(writer->buffer);
9083
Victor Stinner872b2912014-04-05 14:27:07 +02009084 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009085 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009086 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009087 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009088 int translate = unicode_fast_translate_lookup(mapping, ch,
9089 ascii_table);
9090 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009091 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009092 if (translate == 0)
9093 goto exit;
9094 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009095 }
Victor Stinner872b2912014-04-05 14:27:07 +02009096 if (ch2 == 0xfe) {
9097 if (ignore)
9098 continue;
9099 goto exit;
9100 }
9101 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009102 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009103 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009104 }
Victor Stinner872b2912014-04-05 14:27:07 +02009105 res = 1;
9106
9107exit:
9108 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009109 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009110 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009111}
9112
Victor Stinner3222da22015-10-01 22:07:32 +02009113static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114_PyUnicode_TranslateCharmap(PyObject *input,
9115 PyObject *mapping,
9116 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009119 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 Py_ssize_t size, i;
9121 int kind;
9122 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009123 _PyUnicodeWriter writer;
9124 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009125 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009126 PyObject *errorHandler = NULL;
9127 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009128 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009129 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009130
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 PyErr_BadArgument();
9133 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 if (PyUnicode_READY(input) == -1)
9137 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009138 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 kind = PyUnicode_KIND(input);
9140 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009142 if (size == 0)
9143 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009145 /* allocate enough for a simple 1:1 translation without
9146 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009147 _PyUnicodeWriter_Init(&writer);
9148 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009149 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150
Victor Stinner872b2912014-04-05 14:27:07 +02009151 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9152
Victor Stinner33798672016-03-01 21:59:58 +01009153 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009154 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009155 if (PyUnicode_IS_ASCII(input)) {
9156 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9157 if (res < 0) {
9158 _PyUnicodeWriter_Dealloc(&writer);
9159 return NULL;
9160 }
9161 if (res == 1)
9162 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009163 }
Victor Stinner33798672016-03-01 21:59:58 +01009164 else {
9165 i = 0;
9166 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009170 int translate;
9171 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9172 Py_ssize_t newpos;
9173 /* startpos for collecting untranslatable chars */
9174 Py_ssize_t collstart;
9175 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009176 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177
Victor Stinner1194ea02014-04-04 19:37:40 +02009178 ch = PyUnicode_READ(kind, data, i);
9179 translate = charmaptranslate_output(ch, mapping, &writer);
9180 if (translate < 0)
9181 goto onError;
9182
9183 if (translate != 0) {
9184 /* it worked => adjust input pointer */
9185 ++i;
9186 continue;
9187 }
9188
9189 /* untranslatable character */
9190 collstart = i;
9191 collend = i+1;
9192
9193 /* find all untranslatable characters */
9194 while (collend < size) {
9195 PyObject *x;
9196 ch = PyUnicode_READ(kind, data, collend);
9197 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009198 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009199 Py_XDECREF(x);
9200 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009201 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009202 ++collend;
9203 }
9204
9205 if (ignore) {
9206 i = collend;
9207 }
9208 else {
9209 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9210 reason, input, &exc,
9211 collstart, collend, &newpos);
9212 if (repunicode == NULL)
9213 goto onError;
9214 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009215 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009216 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009217 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009218 Py_DECREF(repunicode);
9219 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009220 }
9221 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009222 Py_XDECREF(exc);
9223 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009224 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225
Benjamin Peterson29060642009-01-31 22:14:21 +00009226 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009227 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009228 Py_XDECREF(exc);
9229 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230 return NULL;
9231}
9232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233/* Deprecated. Use PyUnicode_Translate instead. */
9234PyObject *
9235PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9236 Py_ssize_t size,
9237 PyObject *mapping,
9238 const char *errors)
9239{
Christian Heimes5f520f42012-09-11 14:03:25 +02009240 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009241 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 if (!unicode)
9243 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009244 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9245 Py_DECREF(unicode);
9246 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247}
9248
Alexander Belopolsky40018472011-02-26 01:02:56 +00009249PyObject *
9250PyUnicode_Translate(PyObject *str,
9251 PyObject *mapping,
9252 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009254 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009255 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009256 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009257}
Tim Petersced69f82003-09-16 20:30:58 +00009258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259PyObject *
9260_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9261{
9262 if (!PyUnicode_Check(unicode)) {
9263 PyErr_BadInternalCall();
9264 return NULL;
9265 }
9266 if (PyUnicode_READY(unicode) == -1)
9267 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009268 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 /* If the string is already ASCII, just return the same string */
9270 Py_INCREF(unicode);
9271 return unicode;
9272 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009273
9274 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9275 PyObject *result = PyUnicode_New(len, 127);
9276 if (result == NULL) {
9277 return NULL;
9278 }
9279
9280 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9281 int kind = PyUnicode_KIND(unicode);
9282 const void *data = PyUnicode_DATA(unicode);
9283 Py_ssize_t i;
9284 for (i = 0; i < len; ++i) {
9285 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9286 if (ch < 127) {
9287 out[i] = ch;
9288 }
9289 else if (Py_UNICODE_ISSPACE(ch)) {
9290 out[i] = ' ';
9291 }
9292 else {
9293 int decimal = Py_UNICODE_TODECIMAL(ch);
9294 if (decimal < 0) {
9295 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009296 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009297 _PyUnicode_LENGTH(result) = i + 1;
9298 break;
9299 }
9300 out[i] = '0' + decimal;
9301 }
9302 }
9303
INADA Naoki16dfca42018-07-14 12:06:43 +09009304 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009305 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306}
9307
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009308PyObject *
9309PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9310 Py_ssize_t length)
9311{
Victor Stinnerf0124502011-11-21 23:12:56 +01009312 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009313 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009314 Py_UCS4 maxchar;
9315 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009316 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009317
Victor Stinner99d7ad02012-02-22 13:37:39 +01009318 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009319 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009320 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009321 if (ch > 127) {
9322 int decimal = Py_UNICODE_TODECIMAL(ch);
9323 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009324 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009325 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009326 }
9327 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009328
9329 /* Copy to a new string */
9330 decimal = PyUnicode_New(length, maxchar);
9331 if (decimal == NULL)
9332 return decimal;
9333 kind = PyUnicode_KIND(decimal);
9334 data = PyUnicode_DATA(decimal);
9335 /* Iterate over code points */
9336 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009337 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009338 if (ch > 127) {
9339 int decimal = Py_UNICODE_TODECIMAL(ch);
9340 if (decimal >= 0)
9341 ch = '0' + decimal;
9342 }
9343 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009345 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009346}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009347/* --- Decimal Encoder ---------------------------------------------------- */
9348
Alexander Belopolsky40018472011-02-26 01:02:56 +00009349int
9350PyUnicode_EncodeDecimal(Py_UNICODE *s,
9351 Py_ssize_t length,
9352 char *output,
9353 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009354{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009355 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009356 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009357 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009358 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009359
9360 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009361 PyErr_BadArgument();
9362 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009363 }
9364
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009365 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009366 if (unicode == NULL)
9367 return -1;
9368
Victor Stinner42bf7752011-11-21 22:52:58 +01009369 kind = PyUnicode_KIND(unicode);
9370 data = PyUnicode_DATA(unicode);
9371
Victor Stinnerb84d7232011-11-22 01:50:07 +01009372 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009373 PyObject *exc;
9374 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009375 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009376 Py_ssize_t startpos;
9377
9378 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009379
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009381 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009382 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009383 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009384 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009385 decimal = Py_UNICODE_TODECIMAL(ch);
9386 if (decimal >= 0) {
9387 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009388 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 continue;
9390 }
9391 if (0 < ch && ch < 256) {
9392 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009393 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 continue;
9395 }
Victor Stinner6345be92011-11-25 20:09:01 +01009396
Victor Stinner42bf7752011-11-21 22:52:58 +01009397 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009398 exc = NULL;
9399 raise_encode_exception(&exc, "decimal", unicode,
9400 startpos, startpos+1,
9401 "invalid decimal Unicode string");
9402 Py_XDECREF(exc);
9403 Py_DECREF(unicode);
9404 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009405 }
9406 /* 0-terminate the output string */
9407 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009408 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009409 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009410}
9411
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412/* --- Helpers ------------------------------------------------------------ */
9413
/* helper macro to fixup start/end slice values

   end is clipped to len when it is larger; a negative end or start is taken
   relative to len and clipped to 0 if it is still negative.  start and end
   must be modifiable lvalues; they are updated in place.

   The body is wrapped in do { ... } while (0) so that the macro expands to
   exactly one statement and can safely be used under an unbraced if/else.
   Invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009430any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009432 Py_ssize_t end,
9433 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009435 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009436 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 Py_ssize_t len1, len2, result;
9438
9439 kind1 = PyUnicode_KIND(s1);
9440 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009441 if (kind1 < kind2)
9442 return -1;
9443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444 len1 = PyUnicode_GET_LENGTH(s1);
9445 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009446 ADJUST_INDICES(start, end, len1);
9447 if (end - start < len2)
9448 return -1;
9449
9450 buf1 = PyUnicode_DATA(s1);
9451 buf2 = PyUnicode_DATA(s2);
9452 if (len2 == 1) {
9453 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9454 result = findchar((const char *)buf1 + kind1*start,
9455 kind1, end - start, ch, direction);
9456 if (result == -1)
9457 return -1;
9458 else
9459 return start + result;
9460 }
9461
9462 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009463 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009464 if (!buf2)
9465 return -2;
9466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467
Victor Stinner794d5672011-10-10 03:21:36 +02009468 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009469 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009470 case PyUnicode_1BYTE_KIND:
9471 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9472 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9473 else
9474 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9475 break;
9476 case PyUnicode_2BYTE_KIND:
9477 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9478 break;
9479 case PyUnicode_4BYTE_KIND:
9480 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9481 break;
9482 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009483 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009484 }
9485 }
9486 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009487 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009488 case PyUnicode_1BYTE_KIND:
9489 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9490 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9491 else
9492 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9493 break;
9494 case PyUnicode_2BYTE_KIND:
9495 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9496 break;
9497 case PyUnicode_4BYTE_KIND:
9498 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9499 break;
9500 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009501 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 }
9504
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009505 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009506 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009507 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508
9509 return result;
9510}
9511
Victor Stinner59423e32018-11-26 13:40:01 +01009512/* _PyUnicode_InsertThousandsGrouping() helper functions */
9513#include "stringlib/localeutil.h"
9514
9515/**
9516 * InsertThousandsGrouping:
9517 * @writer: Unicode writer.
9518 * @n_buffer: Number of characters in @buffer.
9519 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9520 * @d_pos: Start of digits string.
9521 * @n_digits: The number of digits in the string, in which we want
9522 * to put the grouping chars.
9523 * @min_width: The minimum width of the digits in the output string.
9524 * Output will be zero-padded on the left to fill.
9525 * @grouping: see definition in localeconv().
9526 * @thousands_sep: see definition in localeconv().
9527 *
9528 * There are 2 modes: counting and filling. If @writer is NULL,
9529 * we are in counting mode, else filling mode.
9530 * If counting, the required buffer size is returned.
9531 * If filling, we know the buffer will be large enough, so we don't
9532 * need to pass in the buffer size.
9533 * Inserts thousand grouping characters (as defined by grouping and
9534 * thousands_sep) into @writer.
9535 *
9536 * Return value: -1 on error, number of characters otherwise.
9537 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009539_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009540 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009541 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009542 PyObject *digits,
9543 Py_ssize_t d_pos,
9544 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009545 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009546 const char *grouping,
9547 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009548 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549{
Xtreak3f7983a2019-01-07 20:39:14 +05309550 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009551 if (writer) {
9552 assert(digits != NULL);
9553 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009554 }
9555 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009556 assert(digits == NULL);
9557 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009558 }
Victor Stinner59423e32018-11-26 13:40:01 +01009559 assert(0 <= d_pos);
9560 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009561 assert(grouping != NULL);
9562
9563 if (digits != NULL) {
9564 if (PyUnicode_READY(digits) == -1) {
9565 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009566 }
Victor Stinner59423e32018-11-26 13:40:01 +01009567 }
9568 if (PyUnicode_READY(thousands_sep) == -1) {
9569 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009570 }
9571
Victor Stinner59423e32018-11-26 13:40:01 +01009572 Py_ssize_t count = 0;
9573 Py_ssize_t n_zeros;
9574 int loop_broken = 0;
9575 int use_separator = 0; /* First time through, don't append the
9576 separator. They only go between
9577 groups. */
9578 Py_ssize_t buffer_pos;
9579 Py_ssize_t digits_pos;
9580 Py_ssize_t len;
9581 Py_ssize_t n_chars;
9582 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9583 be looked at */
9584 /* A generator that returns all of the grouping widths, until it
9585 returns 0. */
9586 GroupGenerator groupgen;
9587 GroupGenerator_init(&groupgen, grouping);
9588 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9589
9590 /* if digits are not grouped, thousands separator
9591 should be an empty string */
9592 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9593
9594 digits_pos = d_pos + n_digits;
9595 if (writer) {
9596 buffer_pos = writer->pos + n_buffer;
9597 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9598 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 }
Victor Stinner59423e32018-11-26 13:40:01 +01009600 else {
9601 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009602 }
Victor Stinner59423e32018-11-26 13:40:01 +01009603
9604 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009605 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009606 }
Victor Stinner59423e32018-11-26 13:40:01 +01009607
9608 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9609 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9610 n_zeros = Py_MAX(0, len - remaining);
9611 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9612
9613 /* Use n_zero zero's and n_chars chars */
9614
9615 /* Count only, don't do anything. */
9616 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9617
9618 /* Copy into the writer. */
9619 InsertThousandsGrouping_fill(writer, &buffer_pos,
9620 digits, &digits_pos,
9621 n_chars, n_zeros,
9622 use_separator ? thousands_sep : NULL,
9623 thousands_sep_len, maxchar);
9624
9625 /* Use a separator next time. */
9626 use_separator = 1;
9627
9628 remaining -= n_chars;
9629 min_width -= len;
9630
9631 if (remaining <= 0 && min_width <= 0) {
9632 loop_broken = 1;
9633 break;
9634 }
9635 min_width -= thousands_sep_len;
9636 }
9637 if (!loop_broken) {
9638 /* We left the loop without using a break statement. */
9639
9640 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9641 n_zeros = Py_MAX(0, len - remaining);
9642 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9643
9644 /* Use n_zero zero's and n_chars chars */
9645 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9646
9647 /* Copy into the writer. */
9648 InsertThousandsGrouping_fill(writer, &buffer_pos,
9649 digits, &digits_pos,
9650 n_chars, n_zeros,
9651 use_separator ? thousands_sep : NULL,
9652 thousands_sep_len, maxchar);
9653 }
9654 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655}
9656
9657
Alexander Belopolsky40018472011-02-26 01:02:56 +00009658Py_ssize_t
9659PyUnicode_Count(PyObject *str,
9660 PyObject *substr,
9661 Py_ssize_t start,
9662 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009664 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009665 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009666 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009668
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009669 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009670 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009671
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009672 kind1 = PyUnicode_KIND(str);
9673 kind2 = PyUnicode_KIND(substr);
9674 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009675 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009676
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009677 len1 = PyUnicode_GET_LENGTH(str);
9678 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009680 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009681 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009682
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009683 buf1 = PyUnicode_DATA(str);
9684 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009685 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009686 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009687 if (!buf2)
9688 goto onError;
9689 }
9690
9691 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009693 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009694 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009695 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009696 buf2, len2, PY_SSIZE_T_MAX
9697 );
9698 else
9699 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009700 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009701 buf2, len2, PY_SSIZE_T_MAX
9702 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 break;
9704 case PyUnicode_2BYTE_KIND:
9705 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009706 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 buf2, len2, PY_SSIZE_T_MAX
9708 );
9709 break;
9710 case PyUnicode_4BYTE_KIND:
9711 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009712 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 buf2, len2, PY_SSIZE_T_MAX
9714 );
9715 break;
9716 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009717 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009719
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009720 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009721 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009722 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009726 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9727 if (kind2 != kind1)
9728 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730}
9731
Alexander Belopolsky40018472011-02-26 01:02:56 +00009732Py_ssize_t
9733PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009734 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009735 Py_ssize_t start,
9736 Py_ssize_t end,
9737 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009739 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009740 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009741
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009742 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743}
9744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745Py_ssize_t
9746PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9747 Py_ssize_t start, Py_ssize_t end,
9748 int direction)
9749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009751 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 if (PyUnicode_READY(str) == -1)
9753 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009754 len = PyUnicode_GET_LENGTH(str);
9755 ADJUST_INDICES(start, end, len);
9756 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009757 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009759 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9760 kind, end-start, ch, direction);
9761 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009762 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009763 else
9764 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009765}
9766
Alexander Belopolsky40018472011-02-26 01:02:56 +00009767static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009768tailmatch(PyObject *self,
9769 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009770 Py_ssize_t start,
9771 Py_ssize_t end,
9772 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 int kind_self;
9775 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009776 const void *data_self;
9777 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 Py_ssize_t offset;
9779 Py_ssize_t i;
9780 Py_ssize_t end_sub;
9781
9782 if (PyUnicode_READY(self) == -1 ||
9783 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009784 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9787 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009789 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009791 if (PyUnicode_GET_LENGTH(substring) == 0)
9792 return 1;
9793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 kind_self = PyUnicode_KIND(self);
9795 data_self = PyUnicode_DATA(self);
9796 kind_sub = PyUnicode_KIND(substring);
9797 data_sub = PyUnicode_DATA(substring);
9798 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9799
9800 if (direction > 0)
9801 offset = end;
9802 else
9803 offset = start;
9804
9805 if (PyUnicode_READ(kind_self, data_self, offset) ==
9806 PyUnicode_READ(kind_sub, data_sub, 0) &&
9807 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9808 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9809 /* If both are of the same kind, memcmp is sufficient */
9810 if (kind_self == kind_sub) {
9811 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009812 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 data_sub,
9814 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009815 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009817 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 else {
9819 /* We do not need to compare 0 and len(substring)-1 because
9820 the if statement above ensured already that they are equal
9821 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 for (i = 1; i < end_sub; ++i) {
9823 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9824 PyUnicode_READ(kind_sub, data_sub, i))
9825 return 0;
9826 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009827 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829 }
9830
9831 return 0;
9832}
9833
Alexander Belopolsky40018472011-02-26 01:02:56 +00009834Py_ssize_t
9835PyUnicode_Tailmatch(PyObject *str,
9836 PyObject *substr,
9837 Py_ssize_t start,
9838 Py_ssize_t end,
9839 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009840{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009841 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009842 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009843
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009844 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845}
9846
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009847static PyObject *
9848ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009850 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009851 const char *data = PyUnicode_DATA(self);
9852 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009853 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009854
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009855 res = PyUnicode_New(len, 127);
9856 if (res == NULL)
9857 return NULL;
9858 resdata = PyUnicode_DATA(res);
9859 if (lower)
9860 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009862 _Py_bytes_upper(resdata, data, len);
9863 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864}
9865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009867handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009869 Py_ssize_t j;
9870 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009871 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009872 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009873
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009874 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9875
9876 where ! is a negation and \p{xxx} is a character with property xxx.
9877 */
9878 for (j = i - 1; j >= 0; j--) {
9879 c = PyUnicode_READ(kind, data, j);
9880 if (!_PyUnicode_IsCaseIgnorable(c))
9881 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009883 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9884 if (final_sigma) {
9885 for (j = i + 1; j < length; j++) {
9886 c = PyUnicode_READ(kind, data, j);
9887 if (!_PyUnicode_IsCaseIgnorable(c))
9888 break;
9889 }
9890 final_sigma = j == length || !_PyUnicode_IsCased(c);
9891 }
9892 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893}
9894
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009895static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009896lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009897 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009899 /* Obscure special case. */
9900 if (c == 0x3A3) {
9901 mapped[0] = handle_capital_sigma(kind, data, length, i);
9902 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009904 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905}
9906
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009907static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009908do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009910 Py_ssize_t i, k = 0;
9911 int n_res, j;
9912 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009913
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009914 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009915 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009916 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009917 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009918 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009920 for (i = 1; i < length; i++) {
9921 c = PyUnicode_READ(kind, data, i);
9922 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9923 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009924 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009925 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009926 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009927 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009928 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929}
9930
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009931static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009932do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009933 Py_ssize_t i, k = 0;
9934
9935 for (i = 0; i < length; i++) {
9936 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9937 int n_res, j;
9938 if (Py_UNICODE_ISUPPER(c)) {
9939 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9940 }
9941 else if (Py_UNICODE_ISLOWER(c)) {
9942 n_res = _PyUnicode_ToUpperFull(c, mapped);
9943 }
9944 else {
9945 n_res = 1;
9946 mapped[0] = c;
9947 }
9948 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009949 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009950 res[k++] = mapped[j];
9951 }
9952 }
9953 return k;
9954}
9955
9956static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009957do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009958 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009960 Py_ssize_t i, k = 0;
9961
9962 for (i = 0; i < length; i++) {
9963 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9964 int n_res, j;
9965 if (lower)
9966 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9967 else
9968 n_res = _PyUnicode_ToUpperFull(c, mapped);
9969 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009970 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009971 res[k++] = mapped[j];
9972 }
9973 }
9974 return k;
9975}
9976
9977static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009978do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009979{
9980 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9981}
9982
9983static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009984do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009985{
9986 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9987}
9988
Benjamin Petersone51757f2012-01-12 21:10:29 -05009989static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009990do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -05009991{
9992 Py_ssize_t i, k = 0;
9993
9994 for (i = 0; i < length; i++) {
9995 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9996 Py_UCS4 mapped[3];
9997 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9998 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009999 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010000 res[k++] = mapped[j];
10001 }
10002 }
10003 return k;
10004}
10005
10006static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010007do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010008{
10009 Py_ssize_t i, k = 0;
10010 int previous_is_cased;
10011
10012 previous_is_cased = 0;
10013 for (i = 0; i < length; i++) {
10014 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10015 Py_UCS4 mapped[3];
10016 int n_res, j;
10017
10018 if (previous_is_cased)
10019 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10020 else
10021 n_res = _PyUnicode_ToTitleFull(c, mapped);
10022
10023 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010024 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010025 res[k++] = mapped[j];
10026 }
10027
10028 previous_is_cased = _PyUnicode_IsCased(c);
10029 }
10030 return k;
10031}
10032
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010033static PyObject *
10034case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010035 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010036{
10037 PyObject *res = NULL;
10038 Py_ssize_t length, newlength = 0;
10039 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010040 const void *data;
10041 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010042 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10043
Benjamin Petersoneea48462012-01-16 14:28:50 -050010044 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010045
10046 kind = PyUnicode_KIND(self);
10047 data = PyUnicode_DATA(self);
10048 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010049 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010050 PyErr_SetString(PyExc_OverflowError, "string is too long");
10051 return NULL;
10052 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010053 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010054 if (tmp == NULL)
10055 return PyErr_NoMemory();
10056 newlength = perform(kind, data, length, tmp, &maxchar);
10057 res = PyUnicode_New(newlength, maxchar);
10058 if (res == NULL)
10059 goto leave;
10060 tmpend = tmp + newlength;
10061 outdata = PyUnicode_DATA(res);
10062 outkind = PyUnicode_KIND(res);
10063 switch (outkind) {
10064 case PyUnicode_1BYTE_KIND:
10065 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10066 break;
10067 case PyUnicode_2BYTE_KIND:
10068 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10069 break;
10070 case PyUnicode_4BYTE_KIND:
10071 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10072 break;
10073 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010074 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010075 }
10076 leave:
10077 PyMem_FREE(tmp);
10078 return res;
10079}
10080
Tim Peters8ce9f162004-08-27 01:49:32 +000010081PyObject *
10082PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010084 PyObject *res;
10085 PyObject *fseq;
10086 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010087 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010089 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010090 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010091 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010092 }
10093
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010094 /* NOTE: the following code can't call back into Python code,
10095 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010096 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010097
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010098 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010099 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010100 res = _PyUnicode_JoinArray(separator, items, seqlen);
10101 Py_DECREF(fseq);
10102 return res;
10103}
10104
10105PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010106_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010107{
10108 PyObject *res = NULL; /* the result */
10109 PyObject *sep = NULL;
10110 Py_ssize_t seplen;
10111 PyObject *item;
10112 Py_ssize_t sz, i, res_offset;
10113 Py_UCS4 maxchar;
10114 Py_UCS4 item_maxchar;
10115 int use_memcpy;
10116 unsigned char *res_data = NULL, *sep_data = NULL;
10117 PyObject *last_obj;
10118 unsigned int kind = 0;
10119
Tim Peters05eba1f2004-08-27 21:32:02 +000010120 /* If empty sequence, return u"". */
10121 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010122 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010123 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010124
Tim Peters05eba1f2004-08-27 21:32:02 +000010125 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010126 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010127 if (seqlen == 1) {
10128 if (PyUnicode_CheckExact(items[0])) {
10129 res = items[0];
10130 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010131 return res;
10132 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010133 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010134 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010135 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010136 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010137 /* Set up sep and seplen */
10138 if (separator == NULL) {
10139 /* fall back to a blank space separator */
10140 sep = PyUnicode_FromOrdinal(' ');
10141 if (!sep)
10142 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010143 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010144 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010145 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010146 else {
10147 if (!PyUnicode_Check(separator)) {
10148 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010149 "separator: expected str instance,"
10150 " %.80s found",
10151 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010152 goto onError;
10153 }
10154 if (PyUnicode_READY(separator))
10155 goto onError;
10156 sep = separator;
10157 seplen = PyUnicode_GET_LENGTH(separator);
10158 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10159 /* inc refcount to keep this code path symmetric with the
10160 above case of a blank separator */
10161 Py_INCREF(sep);
10162 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010163 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010164 }
10165
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010166 /* There are at least two things to join, or else we have a subclass
10167 * of str in the sequence.
10168 * Do a pre-pass to figure out the total amount of space we'll
10169 * need (sz), and see whether all argument are strings.
10170 */
10171 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010172#ifdef Py_DEBUG
10173 use_memcpy = 0;
10174#else
10175 use_memcpy = 1;
10176#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010177 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010178 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010179 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010180 if (!PyUnicode_Check(item)) {
10181 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010182 "sequence item %zd: expected str instance,"
10183 " %.80s found",
10184 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010185 goto onError;
10186 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 if (PyUnicode_READY(item) == -1)
10188 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010189 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010191 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010192 if (i != 0) {
10193 add_sz += seplen;
10194 }
10195 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010196 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010197 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010198 goto onError;
10199 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010200 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010201 if (use_memcpy && last_obj != NULL) {
10202 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10203 use_memcpy = 0;
10204 }
10205 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010206 }
Tim Petersced69f82003-09-16 20:30:58 +000010207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010209 if (res == NULL)
10210 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010211
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010212 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010213#ifdef Py_DEBUG
10214 use_memcpy = 0;
10215#else
10216 if (use_memcpy) {
10217 res_data = PyUnicode_1BYTE_DATA(res);
10218 kind = PyUnicode_KIND(res);
10219 if (seplen != 0)
10220 sep_data = PyUnicode_1BYTE_DATA(sep);
10221 }
10222#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010223 if (use_memcpy) {
10224 for (i = 0; i < seqlen; ++i) {
10225 Py_ssize_t itemlen;
10226 item = items[i];
10227
10228 /* Copy item, and maybe the separator. */
10229 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010230 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010231 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010232 kind * seplen);
10233 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010234 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010235
10236 itemlen = PyUnicode_GET_LENGTH(item);
10237 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010238 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010239 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010240 kind * itemlen);
10241 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010242 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010243 }
10244 assert(res_data == PyUnicode_1BYTE_DATA(res)
10245 + kind * PyUnicode_GET_LENGTH(res));
10246 }
10247 else {
10248 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10249 Py_ssize_t itemlen;
10250 item = items[i];
10251
10252 /* Copy item, and maybe the separator. */
10253 if (i && seplen != 0) {
10254 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10255 res_offset += seplen;
10256 }
10257
10258 itemlen = PyUnicode_GET_LENGTH(item);
10259 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010260 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010261 res_offset += itemlen;
10262 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010263 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010264 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010265 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010268 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270
Benjamin Peterson29060642009-01-31 22:14:21 +000010271 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010273 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274 return NULL;
10275}
10276
Victor Stinnerd3f08822012-05-29 12:57:52 +020010277void
10278_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10279 Py_UCS4 fill_char)
10280{
10281 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010282 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010283 assert(PyUnicode_IS_READY(unicode));
10284 assert(unicode_modifiable(unicode));
10285 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10286 assert(start >= 0);
10287 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010288 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010289}
10290
Victor Stinner3fe55312012-01-04 00:33:50 +010010291Py_ssize_t
10292PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10293 Py_UCS4 fill_char)
10294{
10295 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010296
10297 if (!PyUnicode_Check(unicode)) {
10298 PyErr_BadInternalCall();
10299 return -1;
10300 }
10301 if (PyUnicode_READY(unicode) == -1)
10302 return -1;
10303 if (unicode_check_modifiable(unicode))
10304 return -1;
10305
Victor Stinnerd3f08822012-05-29 12:57:52 +020010306 if (start < 0) {
10307 PyErr_SetString(PyExc_IndexError, "string index out of range");
10308 return -1;
10309 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010310 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10311 PyErr_SetString(PyExc_ValueError,
10312 "fill character is bigger than "
10313 "the string maximum character");
10314 return -1;
10315 }
10316
10317 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10318 length = Py_MIN(maxlen, length);
10319 if (length <= 0)
10320 return 0;
10321
Victor Stinnerd3f08822012-05-29 12:57:52 +020010322 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010323 return length;
10324}
10325
Victor Stinner9310abb2011-10-05 00:59:23 +020010326static PyObject *
10327pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010328 Py_ssize_t left,
10329 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 PyObject *u;
10333 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010334 int kind;
10335 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336
10337 if (left < 0)
10338 left = 0;
10339 if (right < 0)
10340 right = 0;
10341
Victor Stinnerc4b49542011-12-11 22:44:26 +010010342 if (left == 0 && right == 0)
10343 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10346 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010347 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10348 return NULL;
10349 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010351 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010353 if (!u)
10354 return NULL;
10355
10356 kind = PyUnicode_KIND(u);
10357 data = PyUnicode_DATA(u);
10358 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010359 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010360 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010361 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010362 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010363 assert(_PyUnicode_CheckConsistency(u, 1));
10364 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365}
10366
Alexander Belopolsky40018472011-02-26 01:02:56 +000010367PyObject *
10368PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010372 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010373 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374
Benjamin Petersonead6b532011-12-20 17:23:42 -060010375 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010377 if (PyUnicode_IS_ASCII(string))
10378 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010379 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010380 PyUnicode_GET_LENGTH(string), keepends);
10381 else
10382 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010383 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010384 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 break;
10386 case PyUnicode_2BYTE_KIND:
10387 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010388 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 PyUnicode_GET_LENGTH(string), keepends);
10390 break;
10391 case PyUnicode_4BYTE_KIND:
10392 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010393 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 PyUnicode_GET_LENGTH(string), keepends);
10395 break;
10396 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010397 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400}
10401
Alexander Belopolsky40018472011-02-26 01:02:56 +000010402static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010403split(PyObject *self,
10404 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010405 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010407 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010408 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 Py_ssize_t len1, len2;
10410 PyObject* out;
10411
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010413 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 if (PyUnicode_READY(self) == -1)
10416 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010419 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010421 if (PyUnicode_IS_ASCII(self))
10422 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010423 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010424 PyUnicode_GET_LENGTH(self), maxcount
10425 );
10426 else
10427 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010428 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010429 PyUnicode_GET_LENGTH(self), maxcount
10430 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 case PyUnicode_2BYTE_KIND:
10432 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010433 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 PyUnicode_GET_LENGTH(self), maxcount
10435 );
10436 case PyUnicode_4BYTE_KIND:
10437 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010438 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 PyUnicode_GET_LENGTH(self), maxcount
10440 );
10441 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010442 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 }
10444
10445 if (PyUnicode_READY(substring) == -1)
10446 return NULL;
10447
10448 kind1 = PyUnicode_KIND(self);
10449 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 len1 = PyUnicode_GET_LENGTH(self);
10451 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010452 if (kind1 < kind2 || len1 < len2) {
10453 out = PyList_New(1);
10454 if (out == NULL)
10455 return NULL;
10456 Py_INCREF(self);
10457 PyList_SET_ITEM(out, 0, self);
10458 return out;
10459 }
10460 buf1 = PyUnicode_DATA(self);
10461 buf2 = PyUnicode_DATA(substring);
10462 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010463 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010464 if (!buf2)
10465 return NULL;
10466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010468 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010470 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10471 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010472 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010473 else
10474 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010475 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 break;
10477 case PyUnicode_2BYTE_KIND:
10478 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010479 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 break;
10481 case PyUnicode_4BYTE_KIND:
10482 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010483 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 break;
10485 default:
10486 out = NULL;
10487 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010488 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010489 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010490 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492}
10493
Alexander Belopolsky40018472011-02-26 01:02:56 +000010494static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010495rsplit(PyObject *self,
10496 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010497 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010498{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010499 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010500 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 Py_ssize_t len1, len2;
10502 PyObject* out;
10503
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010504 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010505 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 if (PyUnicode_READY(self) == -1)
10508 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010511 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010513 if (PyUnicode_IS_ASCII(self))
10514 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010515 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010516 PyUnicode_GET_LENGTH(self), maxcount
10517 );
10518 else
10519 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010520 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010521 PyUnicode_GET_LENGTH(self), maxcount
10522 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 case PyUnicode_2BYTE_KIND:
10524 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010525 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 PyUnicode_GET_LENGTH(self), maxcount
10527 );
10528 case PyUnicode_4BYTE_KIND:
10529 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010530 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 PyUnicode_GET_LENGTH(self), maxcount
10532 );
10533 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010534 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 }
10536
10537 if (PyUnicode_READY(substring) == -1)
10538 return NULL;
10539
10540 kind1 = PyUnicode_KIND(self);
10541 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 len1 = PyUnicode_GET_LENGTH(self);
10543 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010544 if (kind1 < kind2 || len1 < len2) {
10545 out = PyList_New(1);
10546 if (out == NULL)
10547 return NULL;
10548 Py_INCREF(self);
10549 PyList_SET_ITEM(out, 0, self);
10550 return out;
10551 }
10552 buf1 = PyUnicode_DATA(self);
10553 buf2 = PyUnicode_DATA(substring);
10554 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010555 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010556 if (!buf2)
10557 return NULL;
10558 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010560 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010562 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10563 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010564 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010565 else
10566 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010567 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 break;
10569 case PyUnicode_2BYTE_KIND:
10570 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010571 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 break;
10573 case PyUnicode_4BYTE_KIND:
10574 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010575 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 break;
10577 default:
10578 out = NULL;
10579 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010580 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010581 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010582 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 return out;
10584}
10585
10586static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010587anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10588 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010590 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010592 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10593 return asciilib_find(buf1, len1, buf2, len2, offset);
10594 else
10595 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 case PyUnicode_2BYTE_KIND:
10597 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10598 case PyUnicode_4BYTE_KIND:
10599 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10600 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010601 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602}
10603
10604static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010605anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10606 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010608 switch (kind) {
10609 case PyUnicode_1BYTE_KIND:
10610 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10611 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10612 else
10613 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10614 case PyUnicode_2BYTE_KIND:
10615 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10616 case PyUnicode_4BYTE_KIND:
10617 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10618 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010619 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010620}
10621
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010622static void
10623replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10624 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10625{
10626 int kind = PyUnicode_KIND(u);
10627 void *data = PyUnicode_DATA(u);
10628 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10629 if (kind == PyUnicode_1BYTE_KIND) {
10630 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10631 (Py_UCS1 *)data + len,
10632 u1, u2, maxcount);
10633 }
10634 else if (kind == PyUnicode_2BYTE_KIND) {
10635 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10636 (Py_UCS2 *)data + len,
10637 u1, u2, maxcount);
10638 }
10639 else {
10640 assert(kind == PyUnicode_4BYTE_KIND);
10641 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10642 (Py_UCS4 *)data + len,
10643 u1, u2, maxcount);
10644 }
10645}
10646
Alexander Belopolsky40018472011-02-26 01:02:56 +000010647static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648replace(PyObject *self, PyObject *str1,
10649 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010650{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010652 const char *sbuf = PyUnicode_DATA(self);
10653 const void *buf1 = PyUnicode_DATA(str1);
10654 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 int srelease = 0, release1 = 0, release2 = 0;
10656 int skind = PyUnicode_KIND(self);
10657 int kind1 = PyUnicode_KIND(str1);
10658 int kind2 = PyUnicode_KIND(str2);
10659 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10660 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10661 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010662 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010663 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010665 if (slen < len1)
10666 goto nothing;
10667
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010669 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010670 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010671 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672
Victor Stinner59de0ee2011-10-07 10:01:28 +020010673 if (str1 == str2)
10674 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675
Victor Stinner49a0a212011-10-12 23:46:10 +020010676 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010677 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10678 if (maxchar < maxchar_str1)
10679 /* substring too wide to be present */
10680 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010681 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10682 /* Replacing str1 with str2 may cause a maxchar reduction in the
10683 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010684 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010685 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010688 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010690 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010692 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010693 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010694 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010695
Victor Stinner69ed0f42013-04-09 21:48:24 +020010696 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010697 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010698 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010699 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010700 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010702 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010704
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010705 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10706 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010707 }
10708 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 int rkind = skind;
10710 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010711 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 if (kind1 < rkind) {
10714 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010715 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 if (!buf1) goto error;
10717 release1 = 1;
10718 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010719 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010720 if (i < 0)
10721 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 if (rkind > kind2) {
10723 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010724 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 if (!buf2) goto error;
10726 release2 = 1;
10727 }
10728 else if (rkind < kind2) {
10729 /* widen self and buf1 */
10730 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010731 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010732 assert(buf1 != PyUnicode_DATA(str1));
10733 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010734 buf1 = PyUnicode_DATA(str1);
10735 release1 = 0;
10736 }
10737 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 if (!sbuf) goto error;
10739 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010740 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 if (!buf1) goto error;
10742 release1 = 1;
10743 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010744 u = PyUnicode_New(slen, maxchar);
10745 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010747 assert(PyUnicode_KIND(u) == rkind);
10748 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010749
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010750 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010751 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010752 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010754 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010756
10757 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010758 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010759 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010760 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010761 if (i == -1)
10762 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010763 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010765 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010769 }
10770 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010772 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 int rkind = skind;
10774 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010777 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010778 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 if (!buf1) goto error;
10780 release1 = 1;
10781 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010782 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010783 if (n == 0)
10784 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010786 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010787 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 if (!buf2) goto error;
10789 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010792 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010794 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 if (!sbuf) goto error;
10796 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010797 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010798 assert(buf1 != PyUnicode_DATA(str1));
10799 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010800 buf1 = PyUnicode_DATA(str1);
10801 release1 = 0;
10802 }
10803 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 if (!buf1) goto error;
10805 release1 = 1;
10806 }
10807 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10808 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010809 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 PyErr_SetString(PyExc_OverflowError,
10811 "replace string is too long");
10812 goto error;
10813 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010814 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010815 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010816 _Py_INCREF_UNICODE_EMPTY();
10817 if (!unicode_empty)
10818 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010819 u = unicode_empty;
10820 goto done;
10821 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010822 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 PyErr_SetString(PyExc_OverflowError,
10824 "replace string is too long");
10825 goto error;
10826 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010827 u = PyUnicode_New(new_size, maxchar);
10828 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010830 assert(PyUnicode_KIND(u) == rkind);
10831 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 ires = i = 0;
10833 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010834 while (n-- > 0) {
10835 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010836 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010837 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010838 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010839 if (j == -1)
10840 break;
10841 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010842 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010843 memcpy(res + rkind * ires,
10844 sbuf + rkind * i,
10845 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010847 }
10848 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010850 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010852 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010854 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010858 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010859 memcpy(res + rkind * ires,
10860 sbuf + rkind * i,
10861 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010862 }
10863 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010864 /* interleave */
10865 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010866 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010868 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010870 if (--n <= 0)
10871 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010872 memcpy(res + rkind * ires,
10873 sbuf + rkind * i,
10874 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 ires++;
10876 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010877 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010878 memcpy(res + rkind * ires,
10879 sbuf + rkind * i,
10880 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010881 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010882 }
10883
10884 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010885 unicode_adjust_maxchar(&u);
10886 if (u == NULL)
10887 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010889
10890 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010891 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10892 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10893 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010895 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010897 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010898 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010899 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010900 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010901 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010902
Benjamin Peterson29060642009-01-31 22:14:21 +000010903 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010904 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010905 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10906 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10907 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010909 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010911 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010913 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010914 return unicode_result_unchanged(self);
10915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010917 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10918 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10919 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10920 if (srelease)
10921 PyMem_FREE((void *)sbuf);
10922 if (release1)
10923 PyMem_FREE((void *)buf1);
10924 if (release2)
10925 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927}
10928
10929/* --- Unicode Object Methods --------------------------------------------- */
10930
INADA Naoki3ae20562017-01-16 20:41:20 +090010931/*[clinic input]
10932str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933
INADA Naoki3ae20562017-01-16 20:41:20 +090010934Return a version of the string where each word is titlecased.
10935
10936More specifically, words start with uppercased characters and all remaining
10937cased characters have lower case.
10938[clinic start generated code]*/
10939
10940static PyObject *
10941unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010942/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010944 if (PyUnicode_READY(self) == -1)
10945 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010946 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947}
10948
INADA Naoki3ae20562017-01-16 20:41:20 +090010949/*[clinic input]
10950str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951
INADA Naoki3ae20562017-01-16 20:41:20 +090010952Return a capitalized version of the string.
10953
10954More specifically, make the first character have upper case and the rest lower
10955case.
10956[clinic start generated code]*/
10957
10958static PyObject *
10959unicode_capitalize_impl(PyObject *self)
10960/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010962 if (PyUnicode_READY(self) == -1)
10963 return NULL;
10964 if (PyUnicode_GET_LENGTH(self) == 0)
10965 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010966 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967}
10968
INADA Naoki3ae20562017-01-16 20:41:20 +090010969/*[clinic input]
10970str.casefold as unicode_casefold
10971
10972Return a version of the string suitable for caseless comparisons.
10973[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010974
10975static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010976unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010977/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010978{
10979 if (PyUnicode_READY(self) == -1)
10980 return NULL;
10981 if (PyUnicode_IS_ASCII(self))
10982 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010983 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010984}
10985
10986
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010987/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010988
10989static int
10990convert_uc(PyObject *obj, void *addr)
10991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010993
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010994 if (!PyUnicode_Check(obj)) {
10995 PyErr_Format(PyExc_TypeError,
10996 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010997 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010998 return 0;
10999 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011000 if (PyUnicode_READY(obj) < 0)
11001 return 0;
11002 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011003 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011004 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011005 return 0;
11006 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011007 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011008 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011009}
11010
INADA Naoki3ae20562017-01-16 20:41:20 +090011011/*[clinic input]
11012str.center as unicode_center
11013
11014 width: Py_ssize_t
11015 fillchar: Py_UCS4 = ' '
11016 /
11017
11018Return a centered string of length width.
11019
11020Padding is done using the specified fill character (default is a space).
11021[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022
11023static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011024unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11025/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011027 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028
Benjamin Petersonbac79492012-01-14 13:34:47 -050011029 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030 return NULL;
11031
Victor Stinnerc4b49542011-12-11 22:44:26 +010011032 if (PyUnicode_GET_LENGTH(self) >= width)
11033 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034
Victor Stinnerc4b49542011-12-11 22:44:26 +010011035 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036 left = marg / 2 + (marg & width & 1);
11037
Victor Stinner9310abb2011-10-05 00:59:23 +020011038 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039}
11040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041/* This function assumes that str1 and str2 are readied by the caller. */
11042
Marc-André Lemburge5034372000-08-08 08:04:29 +000011043static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011044unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011045{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011046#define COMPARE(TYPE1, TYPE2) \
11047 do { \
11048 TYPE1* p1 = (TYPE1 *)data1; \
11049 TYPE2* p2 = (TYPE2 *)data2; \
11050 TYPE1* end = p1 + len; \
11051 Py_UCS4 c1, c2; \
11052 for (; p1 != end; p1++, p2++) { \
11053 c1 = *p1; \
11054 c2 = *p2; \
11055 if (c1 != c2) \
11056 return (c1 < c2) ? -1 : 1; \
11057 } \
11058 } \
11059 while (0)
11060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011062 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011063 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 kind1 = PyUnicode_KIND(str1);
11066 kind2 = PyUnicode_KIND(str2);
11067 data1 = PyUnicode_DATA(str1);
11068 data2 = PyUnicode_DATA(str2);
11069 len1 = PyUnicode_GET_LENGTH(str1);
11070 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011071 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011072
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011073 switch(kind1) {
11074 case PyUnicode_1BYTE_KIND:
11075 {
11076 switch(kind2) {
11077 case PyUnicode_1BYTE_KIND:
11078 {
11079 int cmp = memcmp(data1, data2, len);
11080 /* normalize result of memcmp() into the range [-1; 1] */
11081 if (cmp < 0)
11082 return -1;
11083 if (cmp > 0)
11084 return 1;
11085 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011086 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011087 case PyUnicode_2BYTE_KIND:
11088 COMPARE(Py_UCS1, Py_UCS2);
11089 break;
11090 case PyUnicode_4BYTE_KIND:
11091 COMPARE(Py_UCS1, Py_UCS4);
11092 break;
11093 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011094 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011095 }
11096 break;
11097 }
11098 case PyUnicode_2BYTE_KIND:
11099 {
11100 switch(kind2) {
11101 case PyUnicode_1BYTE_KIND:
11102 COMPARE(Py_UCS2, Py_UCS1);
11103 break;
11104 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011105 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011106 COMPARE(Py_UCS2, Py_UCS2);
11107 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011108 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011109 case PyUnicode_4BYTE_KIND:
11110 COMPARE(Py_UCS2, Py_UCS4);
11111 break;
11112 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011113 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011114 }
11115 break;
11116 }
11117 case PyUnicode_4BYTE_KIND:
11118 {
11119 switch(kind2) {
11120 case PyUnicode_1BYTE_KIND:
11121 COMPARE(Py_UCS4, Py_UCS1);
11122 break;
11123 case PyUnicode_2BYTE_KIND:
11124 COMPARE(Py_UCS4, Py_UCS2);
11125 break;
11126 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011127 {
11128#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11129 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11130 /* normalize result of wmemcmp() into the range [-1; 1] */
11131 if (cmp < 0)
11132 return -1;
11133 if (cmp > 0)
11134 return 1;
11135#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011136 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011137#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011138 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011139 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011140 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011141 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011142 }
11143 break;
11144 }
11145 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011146 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011147 }
11148
Victor Stinner770e19e2012-10-04 22:59:45 +020011149 if (len1 == len2)
11150 return 0;
11151 if (len1 < len2)
11152 return -1;
11153 else
11154 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011155
11156#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011157}
11158
Benjamin Peterson621b4302016-09-09 13:54:34 -070011159static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011160unicode_compare_eq(PyObject *str1, PyObject *str2)
11161{
11162 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011163 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011164 Py_ssize_t len;
11165 int cmp;
11166
Victor Stinnere5567ad2012-10-23 02:48:49 +020011167 len = PyUnicode_GET_LENGTH(str1);
11168 if (PyUnicode_GET_LENGTH(str2) != len)
11169 return 0;
11170 kind = PyUnicode_KIND(str1);
11171 if (PyUnicode_KIND(str2) != kind)
11172 return 0;
11173 data1 = PyUnicode_DATA(str1);
11174 data2 = PyUnicode_DATA(str2);
11175
11176 cmp = memcmp(data1, data2, len * kind);
11177 return (cmp == 0);
11178}
11179
11180
Alexander Belopolsky40018472011-02-26 01:02:56 +000011181int
11182PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11185 if (PyUnicode_READY(left) == -1 ||
11186 PyUnicode_READY(right) == -1)
11187 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011188
11189 /* a string is equal to itself */
11190 if (left == right)
11191 return 0;
11192
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011193 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011195 PyErr_Format(PyExc_TypeError,
11196 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011197 Py_TYPE(left)->tp_name,
11198 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199 return -1;
11200}
11201
Martin v. Löwis5b222132007-06-10 09:51:05 +000011202int
11203PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 Py_ssize_t i;
11206 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011208 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209
Victor Stinner910337b2011-10-03 03:20:16 +020011210 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011211 if (!PyUnicode_IS_READY(uni)) {
11212 const wchar_t *ws = _PyUnicode_WSTR(uni);
11213 /* Compare Unicode string and source character set string */
11214 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11215 if (chr != ustr[i])
11216 return (chr < ustr[i]) ? -1 : 1;
11217 }
11218 /* This check keeps Python strings that end in '\0' from comparing equal
11219 to C strings identical up to that point. */
11220 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11221 return 1; /* uni is longer */
11222 if (ustr[i])
11223 return -1; /* str is longer */
11224 return 0;
11225 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011227 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011228 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011229 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011230 size_t len, len2 = strlen(str);
11231 int cmp;
11232
11233 len = Py_MIN(len1, len2);
11234 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011235 if (cmp != 0) {
11236 if (cmp < 0)
11237 return -1;
11238 else
11239 return 1;
11240 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011241 if (len1 > len2)
11242 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011243 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011244 return -1; /* str is longer */
11245 return 0;
11246 }
11247 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011248 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011249 /* Compare Unicode string and source character set string */
11250 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011251 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011252 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11253 /* This check keeps Python strings that end in '\0' from comparing equal
11254 to C strings identical up to that point. */
11255 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11256 return 1; /* uni is longer */
11257 if (str[i])
11258 return -1; /* str is longer */
11259 return 0;
11260 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011261}
11262
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011263static int
11264non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11265{
11266 size_t i, len;
11267 const wchar_t *p;
11268 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11269 if (strlen(str) != len)
11270 return 0;
11271 p = _PyUnicode_WSTR(unicode);
11272 assert(p);
11273 for (i = 0; i < len; i++) {
11274 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011275 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011276 return 0;
11277 }
11278 return 1;
11279}
11280
11281int
11282_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11283{
11284 size_t len;
11285 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011286 assert(str);
11287#ifndef NDEBUG
11288 for (const char *p = str; *p; p++) {
11289 assert((unsigned char)*p < 128);
11290 }
11291#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011292 if (PyUnicode_READY(unicode) == -1) {
11293 /* Memory error or bad data */
11294 PyErr_Clear();
11295 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11296 }
11297 if (!PyUnicode_IS_ASCII(unicode))
11298 return 0;
11299 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11300 return strlen(str) == len &&
11301 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11302}
11303
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011304int
11305_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11306{
11307 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011308
11309 assert(_PyUnicode_CHECK(left));
11310 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011311#ifndef NDEBUG
11312 for (const char *p = right->string; *p; p++) {
11313 assert((unsigned char)*p < 128);
11314 }
11315#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011316
11317 if (PyUnicode_READY(left) == -1) {
11318 /* memory error or bad data */
11319 PyErr_Clear();
11320 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11321 }
11322
11323 if (!PyUnicode_IS_ASCII(left))
11324 return 0;
11325
11326 right_uni = _PyUnicode_FromId(right); /* borrowed */
11327 if (right_uni == NULL) {
11328 /* memory error or bad data */
11329 PyErr_Clear();
11330 return _PyUnicode_EqualToASCIIString(left, right->string);
11331 }
11332
11333 if (left == right_uni)
11334 return 1;
11335
11336 if (PyUnicode_CHECK_INTERNED(left))
11337 return 0;
11338
Victor Stinner607b1022020-05-05 18:50:30 +020011339#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011340 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011341 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011342 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11343 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011344#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011345
11346 return unicode_compare_eq(left, right_uni);
11347}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011348
Alexander Belopolsky40018472011-02-26 01:02:56 +000011349PyObject *
11350PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011351{
11352 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011353
Victor Stinnere5567ad2012-10-23 02:48:49 +020011354 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11355 Py_RETURN_NOTIMPLEMENTED;
11356
11357 if (PyUnicode_READY(left) == -1 ||
11358 PyUnicode_READY(right) == -1)
11359 return NULL;
11360
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011361 if (left == right) {
11362 switch (op) {
11363 case Py_EQ:
11364 case Py_LE:
11365 case Py_GE:
11366 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011367 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011368 case Py_NE:
11369 case Py_LT:
11370 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011371 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011372 default:
11373 PyErr_BadArgument();
11374 return NULL;
11375 }
11376 }
11377 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011378 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011379 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011380 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011381 }
11382 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011383 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011384 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011385 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011386}
11387
Alexander Belopolsky40018472011-02-26 01:02:56 +000011388int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011389_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11390{
11391 return unicode_eq(aa, bb);
11392}
11393
11394int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011395PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011396{
Victor Stinner77282cb2013-04-14 19:22:47 +020011397 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011398 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011400 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011401
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011402 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011403 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011404 "'in <string>' requires string as left operand, not %.100s",
11405 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011406 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011407 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011408 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011409 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011410 if (ensure_unicode(str) < 0)
11411 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011414 kind2 = PyUnicode_KIND(substr);
11415 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011416 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011418 len2 = PyUnicode_GET_LENGTH(substr);
11419 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011420 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011421 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011422 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011423 if (len2 == 1) {
11424 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11425 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011426 return result;
11427 }
11428 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011429 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011430 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011431 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433
Victor Stinner77282cb2013-04-14 19:22:47 +020011434 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 case PyUnicode_1BYTE_KIND:
11436 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11437 break;
11438 case PyUnicode_2BYTE_KIND:
11439 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11440 break;
11441 case PyUnicode_4BYTE_KIND:
11442 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11443 break;
11444 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011445 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011447
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011448 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011449 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011450 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451
Guido van Rossum403d68b2000-03-13 15:55:09 +000011452 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011453}
11454
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455/* Concat to string or Unicode object giving a new Unicode object. */
11456
Alexander Belopolsky40018472011-02-26 01:02:56 +000011457PyObject *
11458PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011460 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011461 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011462 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011464 if (ensure_unicode(left) < 0)
11465 return NULL;
11466
11467 if (!PyUnicode_Check(right)) {
11468 PyErr_Format(PyExc_TypeError,
11469 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011470 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011471 return NULL;
11472 }
11473 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011474 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475
11476 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011477 if (left == unicode_empty)
11478 return PyUnicode_FromObject(right);
11479 if (right == unicode_empty)
11480 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011482 left_len = PyUnicode_GET_LENGTH(left);
11483 right_len = PyUnicode_GET_LENGTH(right);
11484 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011485 PyErr_SetString(PyExc_OverflowError,
11486 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011487 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011488 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011489 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011490
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011491 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11492 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011493 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011496 result = PyUnicode_New(new_len, maxchar);
11497 if (result == NULL)
11498 return NULL;
11499 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11500 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11501 assert(_PyUnicode_CheckConsistency(result, 1));
11502 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503}
11504
Walter Dörwald1ab83302007-05-18 17:15:44 +000011505void
Victor Stinner23e56682011-10-03 03:54:37 +020011506PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011507{
Victor Stinner23e56682011-10-03 03:54:37 +020011508 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011509 Py_UCS4 maxchar, maxchar2;
11510 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011511
11512 if (p_left == NULL) {
11513 if (!PyErr_Occurred())
11514 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011515 return;
11516 }
Victor Stinner23e56682011-10-03 03:54:37 +020011517 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011518 if (right == NULL || left == NULL
11519 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011520 if (!PyErr_Occurred())
11521 PyErr_BadInternalCall();
11522 goto error;
11523 }
11524
Benjamin Petersonbac79492012-01-14 13:34:47 -050011525 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011526 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011527 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011528 goto error;
11529
Victor Stinner488fa492011-12-12 00:01:39 +010011530 /* Shortcuts */
11531 if (left == unicode_empty) {
11532 Py_DECREF(left);
11533 Py_INCREF(right);
11534 *p_left = right;
11535 return;
11536 }
11537 if (right == unicode_empty)
11538 return;
11539
11540 left_len = PyUnicode_GET_LENGTH(left);
11541 right_len = PyUnicode_GET_LENGTH(right);
11542 if (left_len > PY_SSIZE_T_MAX - right_len) {
11543 PyErr_SetString(PyExc_OverflowError,
11544 "strings are too large to concat");
11545 goto error;
11546 }
11547 new_len = left_len + right_len;
11548
11549 if (unicode_modifiable(left)
11550 && PyUnicode_CheckExact(right)
11551 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011552 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11553 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011554 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011555 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011556 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11557 {
11558 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011559 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011560 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011561
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011562 /* copy 'right' into the newly allocated area of 'left' */
11563 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011564 }
Victor Stinner488fa492011-12-12 00:01:39 +010011565 else {
11566 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11567 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011568 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011569
Victor Stinner488fa492011-12-12 00:01:39 +010011570 /* Concat the two Unicode strings */
11571 res = PyUnicode_New(new_len, maxchar);
11572 if (res == NULL)
11573 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011574 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11575 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011576 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011577 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011578 }
11579 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011580 return;
11581
11582error:
Victor Stinner488fa492011-12-12 00:01:39 +010011583 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011584}
11585
11586void
11587PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11588{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011589 PyUnicode_Append(pleft, right);
11590 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011591}
11592
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011593/*
11594Wraps stringlib_parse_args_finds() and additionally ensures that the
11595first argument is a unicode object.
11596*/
11597
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011598static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011599parse_args_finds_unicode(const char * function_name, PyObject *args,
11600 PyObject **substring,
11601 Py_ssize_t *start, Py_ssize_t *end)
11602{
11603 if(stringlib_parse_args_finds(function_name, args, substring,
11604 start, end)) {
11605 if (ensure_unicode(*substring) < 0)
11606 return 0;
11607 return 1;
11608 }
11609 return 0;
11610}
11611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011612PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011613 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011615Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011616string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011617interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
11619static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011620unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011622 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011623 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011624 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011626 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011627 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011630 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011631 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 kind1 = PyUnicode_KIND(self);
11634 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011635 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011636 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 len1 = PyUnicode_GET_LENGTH(self);
11639 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011641 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011642 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011643
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011644 buf1 = PyUnicode_DATA(self);
11645 buf2 = PyUnicode_DATA(substring);
11646 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011647 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011648 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011649 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011650 }
11651 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011652 case PyUnicode_1BYTE_KIND:
11653 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011654 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 buf2, len2, PY_SSIZE_T_MAX
11656 );
11657 break;
11658 case PyUnicode_2BYTE_KIND:
11659 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011660 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 buf2, len2, PY_SSIZE_T_MAX
11662 );
11663 break;
11664 case PyUnicode_4BYTE_KIND:
11665 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011666 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 buf2, len2, PY_SSIZE_T_MAX
11668 );
11669 break;
11670 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011671 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 }
11673
11674 result = PyLong_FromSsize_t(iresult);
11675
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011676 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011677 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011678 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680 return result;
11681}
11682
INADA Naoki3ae20562017-01-16 20:41:20 +090011683/*[clinic input]
11684str.encode as unicode_encode
11685
11686 encoding: str(c_default="NULL") = 'utf-8'
11687 The encoding in which to encode the string.
11688 errors: str(c_default="NULL") = 'strict'
11689 The error handling scheme to use for encoding errors.
11690 The default is 'strict' meaning that encoding errors raise a
11691 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11692 'xmlcharrefreplace' as well as any other name registered with
11693 codecs.register_error that can handle UnicodeEncodeErrors.
11694
11695Encode the string using the codec registered for encoding.
11696[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697
11698static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011699unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011700/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011702 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011703}
11704
INADA Naoki3ae20562017-01-16 20:41:20 +090011705/*[clinic input]
11706str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707
INADA Naoki3ae20562017-01-16 20:41:20 +090011708 tabsize: int = 8
11709
11710Return a copy where all tab characters are expanded using spaces.
11711
11712If tabsize is not given, a tab size of 8 characters is assumed.
11713[clinic start generated code]*/
11714
11715static PyObject *
11716unicode_expandtabs_impl(PyObject *self, int tabsize)
11717/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011719 Py_ssize_t i, j, line_pos, src_len, incr;
11720 Py_UCS4 ch;
11721 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011722 const void *src_data;
11723 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011724 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011725 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726
Antoine Pitrou22425222011-10-04 19:10:51 +020011727 if (PyUnicode_READY(self) == -1)
11728 return NULL;
11729
Thomas Wouters7e474022000-07-16 12:04:32 +000011730 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011731 src_len = PyUnicode_GET_LENGTH(self);
11732 i = j = line_pos = 0;
11733 kind = PyUnicode_KIND(self);
11734 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011735 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011736 for (; i < src_len; i++) {
11737 ch = PyUnicode_READ(kind, src_data, i);
11738 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011739 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011741 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011743 goto overflow;
11744 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011746 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011749 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011750 goto overflow;
11751 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011753 if (ch == '\n' || ch == '\r')
11754 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011756 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011757 if (!found)
11758 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011759
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011761 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762 if (!u)
11763 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011764 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
Antoine Pitroue71d5742011-10-04 15:55:09 +020011766 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767
Antoine Pitroue71d5742011-10-04 15:55:09 +020011768 for (; i < src_len; i++) {
11769 ch = PyUnicode_READ(kind, src_data, i);
11770 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011771 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011772 incr = tabsize - (line_pos % tabsize);
11773 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011774 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011775 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011776 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011777 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011778 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011779 line_pos++;
11780 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011781 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011782 if (ch == '\n' || ch == '\r')
11783 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011785 }
11786 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011787 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011788
Antoine Pitroue71d5742011-10-04 15:55:09 +020011789 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011790 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11791 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792}
11793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011794PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011795 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796\n\
11797Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011798such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799arguments start and end are interpreted as in slice notation.\n\
11800\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011801Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802
11803static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011806 /* initialize variables to prevent gcc warning */
11807 PyObject *substring = NULL;
11808 Py_ssize_t start = 0;
11809 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011810 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011812 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011815 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011818 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 if (result == -2)
11821 return NULL;
11822
Christian Heimes217cfd12007-12-02 14:31:20 +000011823 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824}
11825
11826static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011827unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011829 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011830 enum PyUnicode_Kind kind;
11831 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011832
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011833 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011834 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011836 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011837 if (PyUnicode_READY(self) == -1) {
11838 return NULL;
11839 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011840 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11841 PyErr_SetString(PyExc_IndexError, "string index out of range");
11842 return NULL;
11843 }
11844 kind = PyUnicode_KIND(self);
11845 data = PyUnicode_DATA(self);
11846 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011847 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848}
11849
Guido van Rossumc2504932007-09-18 19:42:40 +000011850/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011851 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011852static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011853unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011855 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011856
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011857#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011858 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011859#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 if (_PyUnicode_HASH(self) != -1)
11861 return _PyUnicode_HASH(self);
11862 if (PyUnicode_READY(self) == -1)
11863 return -1;
animalizea1d14252019-01-02 20:16:06 +080011864
Christian Heimes985ecdc2013-11-20 11:46:18 +010011865 x = _Py_HashBytes(PyUnicode_DATA(self),
11866 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011868 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869}
11870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011871PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011872 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873\n\
oldkaa0735f2018-02-02 16:52:55 +080011874Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011875such that sub is contained within S[start:end]. Optional\n\
11876arguments start and end are interpreted as in slice notation.\n\
11877\n\
11878Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879
11880static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011883 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011884 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011885 PyObject *substring = NULL;
11886 Py_ssize_t start = 0;
11887 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011889 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011892 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011895 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 if (result == -2)
11898 return NULL;
11899
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900 if (result < 0) {
11901 PyErr_SetString(PyExc_ValueError, "substring not found");
11902 return NULL;
11903 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011904
Christian Heimes217cfd12007-12-02 14:31:20 +000011905 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906}
11907
INADA Naoki3ae20562017-01-16 20:41:20 +090011908/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011909str.isascii as unicode_isascii
11910
11911Return True if all characters in the string are ASCII, False otherwise.
11912
11913ASCII characters have code points in the range U+0000-U+007F.
11914Empty string is ASCII too.
11915[clinic start generated code]*/
11916
11917static PyObject *
11918unicode_isascii_impl(PyObject *self)
11919/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11920{
11921 if (PyUnicode_READY(self) == -1) {
11922 return NULL;
11923 }
11924 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11925}
11926
11927/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011928str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929
INADA Naoki3ae20562017-01-16 20:41:20 +090011930Return True if the string is a lowercase string, False otherwise.
11931
11932A string is lowercase if all cased characters in the string are lowercase and
11933there is at least one cased character in the string.
11934[clinic start generated code]*/
11935
11936static PyObject *
11937unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011938/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 Py_ssize_t i, length;
11941 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011942 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 int cased;
11944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 if (PyUnicode_READY(self) == -1)
11946 return NULL;
11947 length = PyUnicode_GET_LENGTH(self);
11948 kind = PyUnicode_KIND(self);
11949 data = PyUnicode_DATA(self);
11950
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 if (length == 1)
11953 return PyBool_FromLong(
11954 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011956 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011958 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011959
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 for (i = 0; i < length; i++) {
11962 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011963
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011965 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011966 else if (!cased && Py_UNICODE_ISLOWER(ch))
11967 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011969 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970}
11971
INADA Naoki3ae20562017-01-16 20:41:20 +090011972/*[clinic input]
11973str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974
INADA Naoki3ae20562017-01-16 20:41:20 +090011975Return True if the string is an uppercase string, False otherwise.
11976
11977A string is uppercase if all cased characters in the string are uppercase and
11978there is at least one cased character in the string.
11979[clinic start generated code]*/
11980
11981static PyObject *
11982unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011983/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 Py_ssize_t i, length;
11986 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011987 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988 int cased;
11989
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 if (PyUnicode_READY(self) == -1)
11991 return NULL;
11992 length = PyUnicode_GET_LENGTH(self);
11993 kind = PyUnicode_KIND(self);
11994 data = PyUnicode_DATA(self);
11995
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 if (length == 1)
11998 return PyBool_FromLong(
11999 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012001 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012003 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012004
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 for (i = 0; i < length; i++) {
12007 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012008
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012010 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012011 else if (!cased && Py_UNICODE_ISUPPER(ch))
12012 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012014 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015}
12016
INADA Naoki3ae20562017-01-16 20:41:20 +090012017/*[clinic input]
12018str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019
INADA Naoki3ae20562017-01-16 20:41:20 +090012020Return True if the string is a title-cased string, False otherwise.
12021
12022In a title-cased string, upper- and title-case characters may only
12023follow uncased characters and lowercase characters only cased ones.
12024[clinic start generated code]*/
12025
12026static PyObject *
12027unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012028/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 Py_ssize_t i, length;
12031 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012032 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033 int cased, previous_is_cased;
12034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 if (PyUnicode_READY(self) == -1)
12036 return NULL;
12037 length = PyUnicode_GET_LENGTH(self);
12038 kind = PyUnicode_KIND(self);
12039 data = PyUnicode_DATA(self);
12040
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 if (length == 1) {
12043 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12044 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12045 (Py_UNICODE_ISUPPER(ch) != 0));
12046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012048 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012050 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012051
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052 cased = 0;
12053 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 for (i = 0; i < length; i++) {
12055 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012056
Benjamin Peterson29060642009-01-31 22:14:21 +000012057 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12058 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012059 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012060 previous_is_cased = 1;
12061 cased = 1;
12062 }
12063 else if (Py_UNICODE_ISLOWER(ch)) {
12064 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012065 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012066 previous_is_cased = 1;
12067 cased = 1;
12068 }
12069 else
12070 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012072 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073}
12074
INADA Naoki3ae20562017-01-16 20:41:20 +090012075/*[clinic input]
12076str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077
INADA Naoki3ae20562017-01-16 20:41:20 +090012078Return True if the string is a whitespace string, False otherwise.
12079
12080A string is whitespace if all characters in the string are whitespace and there
12081is at least one character in the string.
12082[clinic start generated code]*/
12083
12084static PyObject *
12085unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012086/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 Py_ssize_t i, length;
12089 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012090 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091
12092 if (PyUnicode_READY(self) == -1)
12093 return NULL;
12094 length = PyUnicode_GET_LENGTH(self);
12095 kind = PyUnicode_KIND(self);
12096 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 if (length == 1)
12100 return PyBool_FromLong(
12101 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012103 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012105 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 for (i = 0; i < length; i++) {
12108 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012109 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012110 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012112 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113}
12114
INADA Naoki3ae20562017-01-16 20:41:20 +090012115/*[clinic input]
12116str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012117
INADA Naoki3ae20562017-01-16 20:41:20 +090012118Return True if the string is an alphabetic string, False otherwise.
12119
12120A string is alphabetic if all characters in the string are alphabetic and there
12121is at least one character in the string.
12122[clinic start generated code]*/
12123
12124static PyObject *
12125unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012126/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012127{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 Py_ssize_t i, length;
12129 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012130 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131
12132 if (PyUnicode_READY(self) == -1)
12133 return NULL;
12134 length = PyUnicode_GET_LENGTH(self);
12135 kind = PyUnicode_KIND(self);
12136 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012137
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012138 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 if (length == 1)
12140 return PyBool_FromLong(
12141 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012142
12143 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012145 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012147 for (i = 0; i < length; i++) {
12148 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012149 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012150 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012151 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012152}
12153
INADA Naoki3ae20562017-01-16 20:41:20 +090012154/*[clinic input]
12155str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012156
INADA Naoki3ae20562017-01-16 20:41:20 +090012157Return True if the string is an alpha-numeric string, False otherwise.
12158
12159A string is alpha-numeric if all characters in the string are alpha-numeric and
12160there is at least one character in the string.
12161[clinic start generated code]*/
12162
12163static PyObject *
12164unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012165/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012166{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012168 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012169 Py_ssize_t len, i;
12170
12171 if (PyUnicode_READY(self) == -1)
12172 return NULL;
12173
12174 kind = PyUnicode_KIND(self);
12175 data = PyUnicode_DATA(self);
12176 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012177
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012178 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 if (len == 1) {
12180 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12181 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12182 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012183
12184 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012186 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 for (i = 0; i < len; i++) {
12189 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012190 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012191 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012192 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012193 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012194}
12195
INADA Naoki3ae20562017-01-16 20:41:20 +090012196/*[clinic input]
12197str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198
INADA Naoki3ae20562017-01-16 20:41:20 +090012199Return True if the string is a decimal string, False otherwise.
12200
12201A string is a decimal string if all characters in the string are decimal and
12202there is at least one character in the string.
12203[clinic start generated code]*/
12204
12205static PyObject *
12206unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012207/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 Py_ssize_t i, length;
12210 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012211 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212
12213 if (PyUnicode_READY(self) == -1)
12214 return NULL;
12215 length = PyUnicode_GET_LENGTH(self);
12216 kind = PyUnicode_KIND(self);
12217 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 if (length == 1)
12221 return PyBool_FromLong(
12222 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012224 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012226 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 for (i = 0; i < length; i++) {
12229 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012230 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012232 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233}
12234
INADA Naoki3ae20562017-01-16 20:41:20 +090012235/*[clinic input]
12236str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237
INADA Naoki3ae20562017-01-16 20:41:20 +090012238Return True if the string is a digit string, False otherwise.
12239
12240A string is a digit string if all characters in the string are digits and there
12241is at least one character in the string.
12242[clinic start generated code]*/
12243
12244static PyObject *
12245unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012246/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 Py_ssize_t i, length;
12249 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012250 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251
12252 if (PyUnicode_READY(self) == -1)
12253 return NULL;
12254 length = PyUnicode_GET_LENGTH(self);
12255 kind = PyUnicode_KIND(self);
12256 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 if (length == 1) {
12260 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12261 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012264 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012266 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 for (i = 0; i < length; i++) {
12269 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012270 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012272 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273}
12274
INADA Naoki3ae20562017-01-16 20:41:20 +090012275/*[clinic input]
12276str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277
INADA Naoki3ae20562017-01-16 20:41:20 +090012278Return True if the string is a numeric string, False otherwise.
12279
12280A string is numeric if all characters in the string are numeric and there is at
12281least one character in the string.
12282[clinic start generated code]*/
12283
12284static PyObject *
12285unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012286/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 Py_ssize_t i, length;
12289 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012290 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291
12292 if (PyUnicode_READY(self) == -1)
12293 return NULL;
12294 length = PyUnicode_GET_LENGTH(self);
12295 kind = PyUnicode_KIND(self);
12296 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 if (length == 1)
12300 return PyBool_FromLong(
12301 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012303 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012305 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 for (i = 0; i < length; i++) {
12308 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012309 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012311 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012312}
12313
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012314Py_ssize_t
12315_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012318 if (PyUnicode_READY(self) == -1)
12319 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012320
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012321 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012322 if (len == 0) {
12323 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012324 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 }
12326
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012327 int kind = PyUnicode_KIND(self);
12328 const void *data = PyUnicode_DATA(self);
12329 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012330 /* PEP 3131 says that the first character must be in
12331 XID_Start and subsequent characters in XID_Continue,
12332 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012333 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012334 letters, digits, underscore). However, given the current
12335 definition of XID_Start and XID_Continue, it is sufficient
12336 to check just for these, except that _ must be allowed
12337 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012338 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012339 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012340 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012341
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012342 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012343 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012344 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012345 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012346 }
12347 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012348 return i;
12349}
12350
12351int
12352PyUnicode_IsIdentifier(PyObject *self)
12353{
12354 if (PyUnicode_IS_READY(self)) {
12355 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12356 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12357 /* an empty string is not a valid identifier */
12358 return len && i == len;
12359 }
12360 else {
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012361 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012362 if (len == 0) {
12363 /* an empty string is not a valid identifier */
12364 return 0;
12365 }
12366
12367 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012368 Py_UCS4 ch = wstr[i++];
12369#if SIZEOF_WCHAR_T == 2
12370 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12371 && i < len
12372 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12373 {
12374 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12375 i++;
12376 }
12377#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012378 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12379 return 0;
12380 }
12381
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012382 while (i < len) {
12383 ch = wstr[i++];
12384#if SIZEOF_WCHAR_T == 2
12385 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12386 && i < len
12387 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12388 {
12389 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12390 i++;
12391 }
12392#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012393 if (!_PyUnicode_IsXidContinue(ch)) {
12394 return 0;
12395 }
12396 }
12397 return 1;
12398 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012399}
12400
INADA Naoki3ae20562017-01-16 20:41:20 +090012401/*[clinic input]
12402str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012403
INADA Naoki3ae20562017-01-16 20:41:20 +090012404Return True if the string is a valid Python identifier, False otherwise.
12405
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012406Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012407such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012408[clinic start generated code]*/
12409
12410static PyObject *
12411unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012412/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012413{
12414 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12415}
12416
INADA Naoki3ae20562017-01-16 20:41:20 +090012417/*[clinic input]
12418str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012419
INADA Naoki3ae20562017-01-16 20:41:20 +090012420Return True if the string is printable, False otherwise.
12421
12422A string is printable if all of its characters are considered printable in
12423repr() or if it is empty.
12424[clinic start generated code]*/
12425
12426static PyObject *
12427unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012428/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 Py_ssize_t i, length;
12431 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012432 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433
12434 if (PyUnicode_READY(self) == -1)
12435 return NULL;
12436 length = PyUnicode_GET_LENGTH(self);
12437 kind = PyUnicode_KIND(self);
12438 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012439
12440 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 if (length == 1)
12442 return PyBool_FromLong(
12443 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012445 for (i = 0; i < length; i++) {
12446 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012447 Py_RETURN_FALSE;
12448 }
12449 }
12450 Py_RETURN_TRUE;
12451}
12452
INADA Naoki3ae20562017-01-16 20:41:20 +090012453/*[clinic input]
12454str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012455
INADA Naoki3ae20562017-01-16 20:41:20 +090012456 iterable: object
12457 /
12458
12459Concatenate any number of strings.
12460
Martin Panter91a88662017-01-24 00:30:06 +000012461The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012462The result is returned as a new string.
12463
12464Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12465[clinic start generated code]*/
12466
12467static PyObject *
12468unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012469/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470{
INADA Naoki3ae20562017-01-16 20:41:20 +090012471 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472}
12473
Martin v. Löwis18e16552006-02-15 17:27:45 +000012474static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012475unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 if (PyUnicode_READY(self) == -1)
12478 return -1;
12479 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012480}
12481
INADA Naoki3ae20562017-01-16 20:41:20 +090012482/*[clinic input]
12483str.ljust as unicode_ljust
12484
12485 width: Py_ssize_t
12486 fillchar: Py_UCS4 = ' '
12487 /
12488
12489Return a left-justified string of length width.
12490
12491Padding is done using the specified fill character (default is a space).
12492[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493
12494static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012495unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12496/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012498 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500
Victor Stinnerc4b49542011-12-11 22:44:26 +010012501 if (PyUnicode_GET_LENGTH(self) >= width)
12502 return unicode_result_unchanged(self);
12503
12504 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505}
12506
INADA Naoki3ae20562017-01-16 20:41:20 +090012507/*[clinic input]
12508str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509
INADA Naoki3ae20562017-01-16 20:41:20 +090012510Return a copy of the string converted to lowercase.
12511[clinic start generated code]*/
12512
12513static PyObject *
12514unicode_lower_impl(PyObject *self)
12515/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012517 if (PyUnicode_READY(self) == -1)
12518 return NULL;
12519 if (PyUnicode_IS_ASCII(self))
12520 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012521 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522}
12523
/* Selectors for the strip helpers.  The values double as indexes into
   stripfuncnames, so the two must be kept in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the selectors above */
static const char *stripfuncnames[] = {
    "lstrip",
    "rstrip",
    "strip",
};

/* Map a strip selector to the name of the corresponding str method. */
#define STRIPNAME(striptype) (stripfuncnames[striptype])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012532
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012533/* externally visible for str.strip(unicode) */
12534PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012535_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012536{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012537 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 int kind;
12539 Py_ssize_t i, j, len;
12540 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012541 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12544 return NULL;
12545
12546 kind = PyUnicode_KIND(self);
12547 data = PyUnicode_DATA(self);
12548 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012549 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12551 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012552 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012553
Benjamin Peterson14339b62009-01-31 16:36:08 +000012554 i = 0;
12555 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012556 while (i < len) {
12557 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12558 if (!BLOOM(sepmask, ch))
12559 break;
12560 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12561 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012562 i++;
12563 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012564 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012565
Benjamin Peterson14339b62009-01-31 16:36:08 +000012566 j = len;
12567 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012568 j--;
12569 while (j >= i) {
12570 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12571 if (!BLOOM(sepmask, ch))
12572 break;
12573 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12574 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012575 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012576 }
12577
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012579 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012580
Victor Stinner7931d9a2011-11-04 00:22:48 +010012581 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582}
12583
12584PyObject*
12585PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12586{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012587 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012589 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590
Victor Stinnerde636f32011-10-01 03:55:54 +020012591 if (PyUnicode_READY(self) == -1)
12592 return NULL;
12593
Victor Stinner684d5fd2012-05-03 02:32:34 +020012594 length = PyUnicode_GET_LENGTH(self);
12595 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012596
Victor Stinner684d5fd2012-05-03 02:32:34 +020012597 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012598 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599
Victor Stinnerde636f32011-10-01 03:55:54 +020012600 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012601 PyErr_SetString(PyExc_IndexError, "string index out of range");
12602 return NULL;
12603 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012604 if (start >= length || end < start)
12605 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012606
Victor Stinner684d5fd2012-05-03 02:32:34 +020012607 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012608 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012609 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012610 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012611 }
12612 else {
12613 kind = PyUnicode_KIND(self);
12614 data = PyUnicode_1BYTE_DATA(self);
12615 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012616 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012617 length);
12618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620
12621static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012622do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624 Py_ssize_t len, i, j;
12625
12626 if (PyUnicode_READY(self) == -1)
12627 return NULL;
12628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012630
Victor Stinnercc7af722013-04-09 22:39:24 +020012631 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012632 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012633
12634 i = 0;
12635 if (striptype != RIGHTSTRIP) {
12636 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012637 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012638 if (!_Py_ascii_whitespace[ch])
12639 break;
12640 i++;
12641 }
12642 }
12643
12644 j = len;
12645 if (striptype != LEFTSTRIP) {
12646 j--;
12647 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012648 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012649 if (!_Py_ascii_whitespace[ch])
12650 break;
12651 j--;
12652 }
12653 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012654 }
12655 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012656 else {
12657 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012658 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012659
Victor Stinnercc7af722013-04-09 22:39:24 +020012660 i = 0;
12661 if (striptype != RIGHTSTRIP) {
12662 while (i < len) {
12663 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12664 if (!Py_UNICODE_ISSPACE(ch))
12665 break;
12666 i++;
12667 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012668 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012669
12670 j = len;
12671 if (striptype != LEFTSTRIP) {
12672 j--;
12673 while (j >= i) {
12674 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12675 if (!Py_UNICODE_ISSPACE(ch))
12676 break;
12677 j--;
12678 }
12679 j++;
12680 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012681 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012682
Victor Stinner7931d9a2011-11-04 00:22:48 +010012683 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684}
12685
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012686
12687static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012688do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012689{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012690 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012691 if (PyUnicode_Check(sep))
12692 return _PyUnicode_XStrip(self, striptype, sep);
12693 else {
12694 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 "%s arg must be None or str",
12696 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012697 return NULL;
12698 }
12699 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012700
Benjamin Peterson14339b62009-01-31 16:36:08 +000012701 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012702}
12703
12704
INADA Naoki3ae20562017-01-16 20:41:20 +090012705/*[clinic input]
12706str.strip as unicode_strip
12707
12708 chars: object = None
12709 /
12710
Zachary Ware09895c22019-10-09 16:09:00 -050012711Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012712
12713If chars is given and not None, remove characters in chars instead.
12714[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012715
12716static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012717unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012718/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012719{
INADA Naoki3ae20562017-01-16 20:41:20 +090012720 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012721}
12722
12723
INADA Naoki3ae20562017-01-16 20:41:20 +090012724/*[clinic input]
12725str.lstrip as unicode_lstrip
12726
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012727 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012728 /
12729
12730Return a copy of the string with leading whitespace removed.
12731
12732If chars is given and not None, remove characters in chars instead.
12733[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012734
12735static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012736unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012737/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012738{
INADA Naoki3ae20562017-01-16 20:41:20 +090012739 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012740}
12741
12742
INADA Naoki3ae20562017-01-16 20:41:20 +090012743/*[clinic input]
12744str.rstrip as unicode_rstrip
12745
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012746 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012747 /
12748
12749Return a copy of the string with trailing whitespace removed.
12750
12751If chars is given and not None, remove characters in chars instead.
12752[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012753
12754static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012755unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012756/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012757{
INADA Naoki3ae20562017-01-16 20:41:20 +090012758 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012759}
12760
12761
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012763unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012765 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012766 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012767
Serhiy Storchaka05997252013-01-26 12:14:02 +020012768 if (len < 1)
12769 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770
Victor Stinnerc4b49542011-12-11 22:44:26 +010012771 /* no repeat, return original string */
12772 if (len == 1)
12773 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012774
Benjamin Petersonbac79492012-01-14 13:34:47 -050012775 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 return NULL;
12777
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012778 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012779 PyErr_SetString(PyExc_OverflowError,
12780 "repeated string is too long");
12781 return NULL;
12782 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012784
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012785 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786 if (!u)
12787 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012788 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012791 int kind = PyUnicode_KIND(str);
12792 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012793 if (kind == PyUnicode_1BYTE_KIND) {
12794 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012795 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012796 }
12797 else if (kind == PyUnicode_2BYTE_KIND) {
12798 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012799 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012800 ucs2[n] = fill_char;
12801 } else {
12802 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12803 assert(kind == PyUnicode_4BYTE_KIND);
12804 for (n = 0; n < len; ++n)
12805 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 }
12808 else {
12809 /* number of characters copied this far */
12810 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012811 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012813 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012817 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012819 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820 }
12821
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012822 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012823 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012824}
12825
Alexander Belopolsky40018472011-02-26 01:02:56 +000012826PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012827PyUnicode_Replace(PyObject *str,
12828 PyObject *substr,
12829 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012830 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012832 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12833 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012834 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012835 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836}
12837
INADA Naoki3ae20562017-01-16 20:41:20 +090012838/*[clinic input]
12839str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840
INADA Naoki3ae20562017-01-16 20:41:20 +090012841 old: unicode
12842 new: unicode
12843 count: Py_ssize_t = -1
12844 Maximum number of occurrences to replace.
12845 -1 (the default value) means replace all occurrences.
12846 /
12847
12848Return a copy with all occurrences of substring old replaced by new.
12849
12850If the optional argument count is given, only the first count occurrences are
12851replaced.
12852[clinic start generated code]*/
12853
12854static PyObject *
12855unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12856 Py_ssize_t count)
12857/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012859 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012860 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012861 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862}
12863
sweeneydea81849b2020-04-22 17:05:48 -040012864/*[clinic input]
12865str.removeprefix as unicode_removeprefix
12866
12867 prefix: unicode
12868 /
12869
12870Return a str with the given prefix string removed if present.
12871
12872If the string starts with the prefix string, return string[len(prefix):].
12873Otherwise, return a copy of the original string.
12874[clinic start generated code]*/
12875
12876static PyObject *
12877unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12878/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12879{
12880 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12881 if (match == -1) {
12882 return NULL;
12883 }
12884 if (match) {
12885 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12886 PyUnicode_GET_LENGTH(self));
12887 }
12888 return unicode_result_unchanged(self);
12889}
12890
12891/*[clinic input]
12892str.removesuffix as unicode_removesuffix
12893
12894 suffix: unicode
12895 /
12896
12897Return a str with the given suffix string removed if present.
12898
12899If the string ends with the suffix string and that suffix is not empty,
12900return string[:-len(suffix)]. Otherwise, return a copy of the original
12901string.
12902[clinic start generated code]*/
12903
12904static PyObject *
12905unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12906/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12907{
12908 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12909 if (match == -1) {
12910 return NULL;
12911 }
12912 if (match) {
12913 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12914 - PyUnicode_GET_LENGTH(suffix));
12915 }
12916 return unicode_result_unchanged(self);
12917}
12918
Alexander Belopolsky40018472011-02-26 01:02:56 +000012919static PyObject *
12920unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012921{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012922 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923 Py_ssize_t isize;
12924 Py_ssize_t osize, squote, dquote, i, o;
12925 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012926 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012927 const void *idata;
12928 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012931 return NULL;
12932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 isize = PyUnicode_GET_LENGTH(unicode);
12934 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 /* Compute length of output, quote characters, and
12937 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012938 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 max = 127;
12940 squote = dquote = 0;
12941 ikind = PyUnicode_KIND(unicode);
12942 for (i = 0; i < isize; i++) {
12943 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012944 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012945 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012946 case '\'': squote++; break;
12947 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012949 incr = 2;
12950 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 default:
12952 /* Fast-path ASCII */
12953 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012954 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012955 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012956 ;
12957 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012958 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012960 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012961 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012962 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012963 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012964 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012966 if (osize > PY_SSIZE_T_MAX - incr) {
12967 PyErr_SetString(PyExc_OverflowError,
12968 "string is too long to generate repr");
12969 return NULL;
12970 }
12971 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 }
12973
12974 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012975 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012977 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012978 if (dquote)
12979 /* Both squote and dquote present. Use squote,
12980 and escape them */
12981 osize += squote;
12982 else
12983 quote = '"';
12984 }
Victor Stinner55c08782013-04-14 18:45:39 +020012985 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986
12987 repr = PyUnicode_New(osize, max);
12988 if (repr == NULL)
12989 return NULL;
12990 okind = PyUnicode_KIND(repr);
12991 odata = PyUnicode_DATA(repr);
12992
12993 PyUnicode_WRITE(okind, odata, 0, quote);
12994 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012995 if (unchanged) {
12996 _PyUnicode_FastCopyCharacters(repr, 1,
12997 unicode, 0,
12998 isize);
12999 }
13000 else {
13001 for (i = 0, o = 1; i < isize; i++) {
13002 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003
Victor Stinner55c08782013-04-14 18:45:39 +020013004 /* Escape quotes and backslashes */
13005 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013006 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013008 continue;
13009 }
13010
13011 /* Map special whitespace to '\t', \n', '\r' */
13012 if (ch == '\t') {
13013 PyUnicode_WRITE(okind, odata, o++, '\\');
13014 PyUnicode_WRITE(okind, odata, o++, 't');
13015 }
13016 else if (ch == '\n') {
13017 PyUnicode_WRITE(okind, odata, o++, '\\');
13018 PyUnicode_WRITE(okind, odata, o++, 'n');
13019 }
13020 else if (ch == '\r') {
13021 PyUnicode_WRITE(okind, odata, o++, '\\');
13022 PyUnicode_WRITE(okind, odata, o++, 'r');
13023 }
13024
13025 /* Map non-printable US ASCII to '\xhh' */
13026 else if (ch < ' ' || ch == 0x7F) {
13027 PyUnicode_WRITE(okind, odata, o++, '\\');
13028 PyUnicode_WRITE(okind, odata, o++, 'x');
13029 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13030 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13031 }
13032
13033 /* Copy ASCII characters as-is */
13034 else if (ch < 0x7F) {
13035 PyUnicode_WRITE(okind, odata, o++, ch);
13036 }
13037
13038 /* Non-ASCII characters */
13039 else {
13040 /* Map Unicode whitespace and control characters
13041 (categories Z* and C* except ASCII space)
13042 */
13043 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13044 PyUnicode_WRITE(okind, odata, o++, '\\');
13045 /* Map 8-bit characters to '\xhh' */
13046 if (ch <= 0xff) {
13047 PyUnicode_WRITE(okind, odata, o++, 'x');
13048 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13049 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13050 }
13051 /* Map 16-bit characters to '\uxxxx' */
13052 else if (ch <= 0xffff) {
13053 PyUnicode_WRITE(okind, odata, o++, 'u');
13054 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13055 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13056 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13057 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13058 }
13059 /* Map 21-bit characters to '\U00xxxxxx' */
13060 else {
13061 PyUnicode_WRITE(okind, odata, o++, 'U');
13062 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13063 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13064 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13065 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13066 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13067 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13068 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13069 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13070 }
13071 }
13072 /* Copy characters as-is */
13073 else {
13074 PyUnicode_WRITE(okind, odata, o++, ch);
13075 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013076 }
13077 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013079 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013080 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013081 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082}
13083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013084PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013085 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086\n\
13087Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013088such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089arguments start and end are interpreted as in slice notation.\n\
13090\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013091Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092
13093static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013094unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013096 /* initialize variables to prevent gcc warning */
13097 PyObject *substring = NULL;
13098 Py_ssize_t start = 0;
13099 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013100 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013102 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013103 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013105 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013106 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013107
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013108 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013110 if (result == -2)
13111 return NULL;
13112
Christian Heimes217cfd12007-12-02 14:31:20 +000013113 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114}
13115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013116PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013117 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013119Return the highest index in S where substring sub is found,\n\
13120such that sub is contained within S[start:end]. Optional\n\
13121arguments start and end are interpreted as in slice notation.\n\
13122\n\
13123Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124
13125static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013128 /* initialize variables to prevent gcc warning */
13129 PyObject *substring = NULL;
13130 Py_ssize_t start = 0;
13131 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013132 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013134 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013135 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013137 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013139
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013140 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142 if (result == -2)
13143 return NULL;
13144
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145 if (result < 0) {
13146 PyErr_SetString(PyExc_ValueError, "substring not found");
13147 return NULL;
13148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149
Christian Heimes217cfd12007-12-02 14:31:20 +000013150 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151}
13152
INADA Naoki3ae20562017-01-16 20:41:20 +090013153/*[clinic input]
13154str.rjust as unicode_rjust
13155
13156 width: Py_ssize_t
13157 fillchar: Py_UCS4 = ' '
13158 /
13159
13160Return a right-justified string of length width.
13161
13162Padding is done using the specified fill character (default is a space).
13163[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013164
13165static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013166unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13167/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013168{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013169 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013170 return NULL;
13171
Victor Stinnerc4b49542011-12-11 22:44:26 +010013172 if (PyUnicode_GET_LENGTH(self) >= width)
13173 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174
Victor Stinnerc4b49542011-12-11 22:44:26 +010013175 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013176}
13177
Alexander Belopolsky40018472011-02-26 01:02:56 +000013178PyObject *
13179PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013181 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013182 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013184 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185}
13186
INADA Naoki3ae20562017-01-16 20:41:20 +090013187/*[clinic input]
13188str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189
INADA Naoki3ae20562017-01-16 20:41:20 +090013190 sep: object = None
13191 The delimiter according which to split the string.
13192 None (the default value) means split according to any whitespace,
13193 and discard empty strings from the result.
13194 maxsplit: Py_ssize_t = -1
13195 Maximum number of splits to do.
13196 -1 (the default value) means no limit.
13197
13198Return a list of the words in the string, using sep as the delimiter string.
13199[clinic start generated code]*/
13200
13201static PyObject *
13202unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13203/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204{
INADA Naoki3ae20562017-01-16 20:41:20 +090013205 if (sep == Py_None)
13206 return split(self, NULL, maxsplit);
13207 if (PyUnicode_Check(sep))
13208 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013209
Victor Stinner998b8062018-09-12 00:23:25 +020013210 PyErr_Format(PyExc_TypeError,
13211 "must be str or None, not %.100s",
13212 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013213 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013214}
13215
Thomas Wouters477c8d52006-05-27 19:21:47 +000013216PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013217PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013218{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013219 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013220 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013221 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013222 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013223
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013224 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013225 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013226
Victor Stinner14f8f022011-10-05 20:58:25 +020013227 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013228 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013229 len1 = PyUnicode_GET_LENGTH(str_obj);
13230 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013231 if (kind1 < kind2 || len1 < len2) {
13232 _Py_INCREF_UNICODE_EMPTY();
13233 if (!unicode_empty)
13234 out = NULL;
13235 else {
13236 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13237 Py_DECREF(unicode_empty);
13238 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013239 return out;
13240 }
13241 buf1 = PyUnicode_DATA(str_obj);
13242 buf2 = PyUnicode_DATA(sep_obj);
13243 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013244 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013245 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013246 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013248
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013249 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013250 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013251 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13252 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13253 else
13254 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013255 break;
13256 case PyUnicode_2BYTE_KIND:
13257 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13258 break;
13259 case PyUnicode_4BYTE_KIND:
13260 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13261 break;
13262 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013263 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013265
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013266 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013267 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013268 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013269
13270 return out;
13271}
13272
13273
13274PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013275PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013276{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013277 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013278 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013279 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013280 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013281
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013282 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013283 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013284
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013285 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013287 len1 = PyUnicode_GET_LENGTH(str_obj);
13288 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013289 if (kind1 < kind2 || len1 < len2) {
13290 _Py_INCREF_UNICODE_EMPTY();
13291 if (!unicode_empty)
13292 out = NULL;
13293 else {
13294 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13295 Py_DECREF(unicode_empty);
13296 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013297 return out;
13298 }
13299 buf1 = PyUnicode_DATA(str_obj);
13300 buf2 = PyUnicode_DATA(sep_obj);
13301 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013302 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013303 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013304 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013306
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013307 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013308 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013309 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13310 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13311 else
13312 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013313 break;
13314 case PyUnicode_2BYTE_KIND:
13315 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13316 break;
13317 case PyUnicode_4BYTE_KIND:
13318 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13319 break;
13320 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013321 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013323
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013324 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013325 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013326 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013327
13328 return out;
13329}
13330
INADA Naoki3ae20562017-01-16 20:41:20 +090013331/*[clinic input]
13332str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013333
INADA Naoki3ae20562017-01-16 20:41:20 +090013334 sep: object
13335 /
13336
13337Partition the string into three parts using the given separator.
13338
13339This will search for the separator in the string. If the separator is found,
13340returns a 3-tuple containing the part before the separator, the separator
13341itself, and the part after it.
13342
13343If the separator is not found, returns a 3-tuple containing the original string
13344and two empty strings.
13345[clinic start generated code]*/
13346
13347static PyObject *
13348unicode_partition(PyObject *self, PyObject *sep)
13349/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013350{
INADA Naoki3ae20562017-01-16 20:41:20 +090013351 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013352}
13353
INADA Naoki3ae20562017-01-16 20:41:20 +090013354/*[clinic input]
13355str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013356
INADA Naoki3ae20562017-01-16 20:41:20 +090013357Partition the string into three parts using the given separator.
13358
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013359This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013360the separator is found, returns a 3-tuple containing the part before the
13361separator, the separator itself, and the part after it.
13362
13363If the separator is not found, returns a 3-tuple containing two empty strings
13364and the original string.
13365[clinic start generated code]*/
13366
13367static PyObject *
13368unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013369/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013370{
INADA Naoki3ae20562017-01-16 20:41:20 +090013371 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013372}
13373
Alexander Belopolsky40018472011-02-26 01:02:56 +000013374PyObject *
13375PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013376{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013377 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013378 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013379
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013380 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013381}
13382
INADA Naoki3ae20562017-01-16 20:41:20 +090013383/*[clinic input]
13384str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013385
INADA Naoki3ae20562017-01-16 20:41:20 +090013386Return a list of the words in the string, using sep as the delimiter string.
13387
13388Splits are done starting at the end of the string and working to the front.
13389[clinic start generated code]*/
13390
13391static PyObject *
13392unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13393/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013394{
INADA Naoki3ae20562017-01-16 20:41:20 +090013395 if (sep == Py_None)
13396 return rsplit(self, NULL, maxsplit);
13397 if (PyUnicode_Check(sep))
13398 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013399
Victor Stinner998b8062018-09-12 00:23:25 +020013400 PyErr_Format(PyExc_TypeError,
13401 "must be str or None, not %.100s",
13402 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013403 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013404}
13405
INADA Naoki3ae20562017-01-16 20:41:20 +090013406/*[clinic input]
13407str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013408
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013409 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013410
13411Return a list of the lines in the string, breaking at line boundaries.
13412
13413Line breaks are not included in the resulting list unless keepends is given and
13414true.
13415[clinic start generated code]*/
13416
13417static PyObject *
13418unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013419/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013421 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422}
13423
13424static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013425PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013427 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013428}
13429
INADA Naoki3ae20562017-01-16 20:41:20 +090013430/*[clinic input]
13431str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432
INADA Naoki3ae20562017-01-16 20:41:20 +090013433Convert uppercase characters to lowercase and lowercase characters to uppercase.
13434[clinic start generated code]*/
13435
13436static PyObject *
13437unicode_swapcase_impl(PyObject *self)
13438/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013440 if (PyUnicode_READY(self) == -1)
13441 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013442 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013443}
13444
Larry Hastings61272b72014-01-07 12:41:53 -080013445/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013446
Larry Hastings31826802013-10-19 00:09:25 -070013447@staticmethod
13448str.maketrans as unicode_maketrans
13449
13450 x: object
13451
13452 y: unicode=NULL
13453
13454 z: unicode=NULL
13455
13456 /
13457
13458Return a translation table usable for str.translate().
13459
13460If there is only one argument, it must be a dictionary mapping Unicode
13461ordinals (integers) or characters to Unicode ordinals, strings or None.
13462Character keys will be then converted to ordinals.
13463If there are two arguments, they must be strings of equal length, and
13464in the resulting dictionary, each character in x will be mapped to the
13465character at the same position in y. If there is a third argument, it
13466must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013467[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013468
Larry Hastings31826802013-10-19 00:09:25 -070013469static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013470unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013471/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013472{
Georg Brandlceee0772007-11-27 23:48:05 +000013473 PyObject *new = NULL, *key, *value;
13474 Py_ssize_t i = 0;
13475 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013476
Georg Brandlceee0772007-11-27 23:48:05 +000013477 new = PyDict_New();
13478 if (!new)
13479 return NULL;
13480 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013481 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013482 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013483
Georg Brandlceee0772007-11-27 23:48:05 +000013484 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013485 if (!PyUnicode_Check(x)) {
13486 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13487 "be a string if there is a second argument");
13488 goto err;
13489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013490 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013491 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13492 "arguments must have equal length");
13493 goto err;
13494 }
13495 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013496 x_kind = PyUnicode_KIND(x);
13497 y_kind = PyUnicode_KIND(y);
13498 x_data = PyUnicode_DATA(x);
13499 y_data = PyUnicode_DATA(y);
13500 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13501 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013502 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013503 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013504 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013505 if (!value) {
13506 Py_DECREF(key);
13507 goto err;
13508 }
Georg Brandlceee0772007-11-27 23:48:05 +000013509 res = PyDict_SetItem(new, key, value);
13510 Py_DECREF(key);
13511 Py_DECREF(value);
13512 if (res < 0)
13513 goto err;
13514 }
13515 /* create entries for deleting chars in z */
13516 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013517 z_kind = PyUnicode_KIND(z);
13518 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013519 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013520 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013521 if (!key)
13522 goto err;
13523 res = PyDict_SetItem(new, key, Py_None);
13524 Py_DECREF(key);
13525 if (res < 0)
13526 goto err;
13527 }
13528 }
13529 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013530 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013531 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013532
Georg Brandlceee0772007-11-27 23:48:05 +000013533 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013534 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013535 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13536 "to maketrans it must be a dict");
13537 goto err;
13538 }
13539 /* copy entries into the new dict, converting string keys to int keys */
13540 while (PyDict_Next(x, &i, &key, &value)) {
13541 if (PyUnicode_Check(key)) {
13542 /* convert string keys to integer keys */
13543 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013544 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013545 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13546 "table must be of length 1");
13547 goto err;
13548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013549 kind = PyUnicode_KIND(key);
13550 data = PyUnicode_DATA(key);
13551 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013552 if (!newkey)
13553 goto err;
13554 res = PyDict_SetItem(new, newkey, value);
13555 Py_DECREF(newkey);
13556 if (res < 0)
13557 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013558 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013559 /* just keep integer keys */
13560 if (PyDict_SetItem(new, key, value) < 0)
13561 goto err;
13562 } else {
13563 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13564 "be strings or integers");
13565 goto err;
13566 }
13567 }
13568 }
13569 return new;
13570 err:
13571 Py_DECREF(new);
13572 return NULL;
13573}
13574
INADA Naoki3ae20562017-01-16 20:41:20 +090013575/*[clinic input]
13576str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013577
INADA Naoki3ae20562017-01-16 20:41:20 +090013578 table: object
13579 Translation table, which must be a mapping of Unicode ordinals to
13580 Unicode ordinals, strings, or None.
13581 /
13582
13583Replace each character in the string using the given translation table.
13584
13585The table must implement lookup/indexing via __getitem__, for instance a
13586dictionary or list. If this operation raises LookupError, the character is
13587left untouched. Characters mapped to None are deleted.
13588[clinic start generated code]*/
13589
13590static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013591unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013592/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013593{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013594 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013595}
13596
INADA Naoki3ae20562017-01-16 20:41:20 +090013597/*[clinic input]
13598str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013599
INADA Naoki3ae20562017-01-16 20:41:20 +090013600Return a copy of the string converted to uppercase.
13601[clinic start generated code]*/
13602
13603static PyObject *
13604unicode_upper_impl(PyObject *self)
13605/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013606{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013607 if (PyUnicode_READY(self) == -1)
13608 return NULL;
13609 if (PyUnicode_IS_ASCII(self))
13610 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013611 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013612}
13613
INADA Naoki3ae20562017-01-16 20:41:20 +090013614/*[clinic input]
13615str.zfill as unicode_zfill
13616
13617 width: Py_ssize_t
13618 /
13619
13620Pad a numeric string with zeros on the left, to fill a field of the given width.
13621
13622The string is never truncated.
13623[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013624
13625static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013626unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013627/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013628{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013629 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013630 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013632 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013633 Py_UCS4 chr;
13634
Benjamin Petersonbac79492012-01-14 13:34:47 -050013635 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013636 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013637
Victor Stinnerc4b49542011-12-11 22:44:26 +010013638 if (PyUnicode_GET_LENGTH(self) >= width)
13639 return unicode_result_unchanged(self);
13640
13641 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013642
13643 u = pad(self, fill, 0, '0');
13644
Walter Dörwald068325e2002-04-15 13:36:47 +000013645 if (u == NULL)
13646 return NULL;
13647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013648 kind = PyUnicode_KIND(u);
13649 data = PyUnicode_DATA(u);
13650 chr = PyUnicode_READ(kind, data, fill);
13651
13652 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013653 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013654 PyUnicode_WRITE(kind, data, 0, chr);
13655 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013656 }
13657
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013658 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013659 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013660}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013661
#if 0
/* Compiled out.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013670PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013671 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013672\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013673Return True if S starts with the specified prefix, False otherwise.\n\
13674With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013675With optional end, stop comparing S at that position.\n\
13676prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013677
13678static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013679unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013680 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013681{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013682 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013683 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013684 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013685 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013686 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013687
Jesus Ceaac451502011-04-20 17:09:23 +020013688 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013689 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013690 if (PyTuple_Check(subobj)) {
13691 Py_ssize_t i;
13692 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013693 substring = PyTuple_GET_ITEM(subobj, i);
13694 if (!PyUnicode_Check(substring)) {
13695 PyErr_Format(PyExc_TypeError,
13696 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013697 "not %.100s",
13698 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013699 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013700 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013701 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013702 if (result == -1)
13703 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013704 if (result) {
13705 Py_RETURN_TRUE;
13706 }
13707 }
13708 /* nothing matched */
13709 Py_RETURN_FALSE;
13710 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013711 if (!PyUnicode_Check(subobj)) {
13712 PyErr_Format(PyExc_TypeError,
13713 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013714 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013715 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013716 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013717 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013718 if (result == -1)
13719 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013720 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013721}
13722
13723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013724PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013725 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013726\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013727Return True if S ends with the specified suffix, False otherwise.\n\
13728With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013729With optional end, stop comparing S at that position.\n\
13730suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013731
13732static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013733unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013734 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013735{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013736 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013737 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013738 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013739 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013740 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013741
Jesus Ceaac451502011-04-20 17:09:23 +020013742 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013743 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013744 if (PyTuple_Check(subobj)) {
13745 Py_ssize_t i;
13746 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013747 substring = PyTuple_GET_ITEM(subobj, i);
13748 if (!PyUnicode_Check(substring)) {
13749 PyErr_Format(PyExc_TypeError,
13750 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013751 "not %.100s",
13752 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013753 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013754 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013755 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013756 if (result == -1)
13757 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013758 if (result) {
13759 Py_RETURN_TRUE;
13760 }
13761 }
13762 Py_RETURN_FALSE;
13763 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013764 if (!PyUnicode_Check(subobj)) {
13765 PyErr_Format(PyExc_TypeError,
13766 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013767 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013768 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013769 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013770 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013771 if (result == -1)
13772 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013773 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013774}
13775
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013776static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013777_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013778{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013779 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13780 writer->data = PyUnicode_DATA(writer->buffer);
13781
13782 if (!writer->readonly) {
13783 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013784 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013785 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013786 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013787 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13788 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13789 writer->kind = PyUnicode_WCHAR_KIND;
13790 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13791
Victor Stinner8f674cc2013-04-17 23:02:17 +020013792 /* Copy-on-write mode: set buffer size to 0 so
13793 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13794 * next write. */
13795 writer->size = 0;
13796 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013797}
13798
Victor Stinnerd3f08822012-05-29 12:57:52 +020013799void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013800_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013801{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013802 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013803
13804 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013805 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013806
13807 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13808 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13809 writer->kind = PyUnicode_WCHAR_KIND;
13810 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013811}
13812
Inada Naoki770847a2019-06-24 12:30:24 +090013813// Initialize _PyUnicodeWriter with initial buffer
13814static inline void
13815_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13816{
13817 memset(writer, 0, sizeof(*writer));
13818 writer->buffer = buffer;
13819 _PyUnicodeWriter_Update(writer);
13820 writer->min_length = writer->size;
13821}
13822
Victor Stinnerd3f08822012-05-29 12:57:52 +020013823int
13824_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13825 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013826{
13827 Py_ssize_t newlen;
13828 PyObject *newbuffer;
13829
Victor Stinner2740e462016-09-06 16:58:36 -070013830 assert(maxchar <= MAX_UNICODE);
13831
Victor Stinnerca9381e2015-09-22 00:58:32 +020013832 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013833 assert((maxchar > writer->maxchar && length >= 0)
13834 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013835
Victor Stinner202fdca2012-05-07 12:47:02 +020013836 if (length > PY_SSIZE_T_MAX - writer->pos) {
13837 PyErr_NoMemory();
13838 return -1;
13839 }
13840 newlen = writer->pos + length;
13841
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013842 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013843
Victor Stinnerd3f08822012-05-29 12:57:52 +020013844 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013845 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013846 if (writer->overallocate
13847 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13848 /* overallocate to limit the number of realloc() */
13849 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013850 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013851 if (newlen < writer->min_length)
13852 newlen = writer->min_length;
13853
Victor Stinnerd3f08822012-05-29 12:57:52 +020013854 writer->buffer = PyUnicode_New(newlen, maxchar);
13855 if (writer->buffer == NULL)
13856 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013857 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013858 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013859 if (writer->overallocate
13860 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13861 /* overallocate to limit the number of realloc() */
13862 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013863 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013864 if (newlen < writer->min_length)
13865 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013866
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013867 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013868 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013869 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013870 newbuffer = PyUnicode_New(newlen, maxchar);
13871 if (newbuffer == NULL)
13872 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013873 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13874 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013875 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013876 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013877 }
13878 else {
13879 newbuffer = resize_compact(writer->buffer, newlen);
13880 if (newbuffer == NULL)
13881 return -1;
13882 }
13883 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013884 }
13885 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013886 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013887 newbuffer = PyUnicode_New(writer->size, maxchar);
13888 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013889 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013890 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13891 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013892 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013893 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013894 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013895 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013896
13897#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013898}
13899
Victor Stinnerca9381e2015-09-22 00:58:32 +020013900int
13901_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13902 enum PyUnicode_Kind kind)
13903{
13904 Py_UCS4 maxchar;
13905
13906 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13907 assert(writer->kind < kind);
13908
13909 switch (kind)
13910 {
13911 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13912 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13913 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13914 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013915 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013916 }
13917
13918 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13919}
13920
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013921static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013922_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013923{
Victor Stinner2740e462016-09-06 16:58:36 -070013924 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013925 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13926 return -1;
13927 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13928 writer->pos++;
13929 return 0;
13930}
13931
13932int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013933_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13934{
13935 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13936}
13937
13938int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013939_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13940{
13941 Py_UCS4 maxchar;
13942 Py_ssize_t len;
13943
13944 if (PyUnicode_READY(str) == -1)
13945 return -1;
13946 len = PyUnicode_GET_LENGTH(str);
13947 if (len == 0)
13948 return 0;
13949 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13950 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013951 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013952 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013953 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013954 Py_INCREF(str);
13955 writer->buffer = str;
13956 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013957 writer->pos += len;
13958 return 0;
13959 }
13960 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13961 return -1;
13962 }
13963 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13964 str, 0, len);
13965 writer->pos += len;
13966 return 0;
13967}
13968
Victor Stinnere215d962012-10-06 23:03:36 +020013969int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013970_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13971 Py_ssize_t start, Py_ssize_t end)
13972{
13973 Py_UCS4 maxchar;
13974 Py_ssize_t len;
13975
13976 if (PyUnicode_READY(str) == -1)
13977 return -1;
13978
13979 assert(0 <= start);
13980 assert(end <= PyUnicode_GET_LENGTH(str));
13981 assert(start <= end);
13982
13983 if (end == 0)
13984 return 0;
13985
13986 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13987 return _PyUnicodeWriter_WriteStr(writer, str);
13988
13989 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13990 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13991 else
13992 maxchar = writer->maxchar;
13993 len = end - start;
13994
13995 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13996 return -1;
13997
13998 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13999 str, start, len);
14000 writer->pos += len;
14001 return 0;
14002}
14003
14004int
Victor Stinner4a587072013-11-19 12:54:53 +010014005_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14006 const char *ascii, Py_ssize_t len)
14007{
14008 if (len == -1)
14009 len = strlen(ascii);
14010
Andy Lestere6be9b52020-02-11 20:28:35 -060014011 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014012
14013 if (writer->buffer == NULL && !writer->overallocate) {
14014 PyObject *str;
14015
14016 str = _PyUnicode_FromASCII(ascii, len);
14017 if (str == NULL)
14018 return -1;
14019
14020 writer->readonly = 1;
14021 writer->buffer = str;
14022 _PyUnicodeWriter_Update(writer);
14023 writer->pos += len;
14024 return 0;
14025 }
14026
14027 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14028 return -1;
14029
14030 switch (writer->kind)
14031 {
14032 case PyUnicode_1BYTE_KIND:
14033 {
14034 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14035 Py_UCS1 *data = writer->data;
14036
Christian Heimesf051e432016-09-13 20:22:02 +020014037 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014038 break;
14039 }
14040 case PyUnicode_2BYTE_KIND:
14041 {
14042 _PyUnicode_CONVERT_BYTES(
14043 Py_UCS1, Py_UCS2,
14044 ascii, ascii + len,
14045 (Py_UCS2 *)writer->data + writer->pos);
14046 break;
14047 }
14048 case PyUnicode_4BYTE_KIND:
14049 {
14050 _PyUnicode_CONVERT_BYTES(
14051 Py_UCS1, Py_UCS4,
14052 ascii, ascii + len,
14053 (Py_UCS4 *)writer->data + writer->pos);
14054 break;
14055 }
14056 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014057 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014058 }
14059
14060 writer->pos += len;
14061 return 0;
14062}
14063
14064int
14065_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14066 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014067{
14068 Py_UCS4 maxchar;
14069
Andy Lestere6be9b52020-02-11 20:28:35 -060014070 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014071 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14072 return -1;
14073 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14074 writer->pos += len;
14075 return 0;
14076}
14077
Victor Stinnerd3f08822012-05-29 12:57:52 +020014078PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014079_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014080{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014081 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014082
Victor Stinnerd3f08822012-05-29 12:57:52 +020014083 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014084 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014085 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014086 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014087
14088 str = writer->buffer;
14089 writer->buffer = NULL;
14090
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014091 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014092 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14093 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014094 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014095
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014096 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14097 PyObject *str2;
14098 str2 = resize_compact(str, writer->pos);
14099 if (str2 == NULL) {
14100 Py_DECREF(str);
14101 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014102 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014103 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014104 }
14105
Victor Stinner15a0bd32013-07-08 22:29:55 +020014106 assert(_PyUnicode_CheckConsistency(str, 1));
14107 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014108}
14109
Victor Stinnerd3f08822012-05-29 12:57:52 +020014110void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014111_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014112{
14113 Py_CLEAR(writer->buffer);
14114}
14115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014116#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014117
14118PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014119 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014120\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014121Return a formatted version of S, using substitutions from args and kwargs.\n\
14122The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014123
Eric Smith27bbca62010-11-04 17:06:58 +000014124PyDoc_STRVAR(format_map__doc__,
14125 "S.format_map(mapping) -> str\n\
14126\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014127Return a formatted version of S, using substitutions from mapping.\n\
14128The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014129
INADA Naoki3ae20562017-01-16 20:41:20 +090014130/*[clinic input]
14131str.__format__ as unicode___format__
14132
14133 format_spec: unicode
14134 /
14135
14136Return a formatted version of the string as described by format_spec.
14137[clinic start generated code]*/
14138
Eric Smith4a7d76d2008-05-30 18:10:19 +000014139static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014140unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014141/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014142{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014143 _PyUnicodeWriter writer;
14144 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014145
Victor Stinnerd3f08822012-05-29 12:57:52 +020014146 if (PyUnicode_READY(self) == -1)
14147 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014148 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014149 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14150 self, format_spec, 0,
14151 PyUnicode_GET_LENGTH(format_spec));
14152 if (ret == -1) {
14153 _PyUnicodeWriter_Dealloc(&writer);
14154 return NULL;
14155 }
14156 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014157}
14158
INADA Naoki3ae20562017-01-16 20:41:20 +090014159/*[clinic input]
14160str.__sizeof__ as unicode_sizeof
14161
14162Return the size of the string in memory, in bytes.
14163[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014164
14165static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014166unicode_sizeof_impl(PyObject *self)
14167/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014168{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014169 Py_ssize_t size;
14170
14171 /* If it's a compact object, account for base structure +
14172 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014173 if (PyUnicode_IS_COMPACT_ASCII(self))
14174 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14175 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014176 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014177 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014178 else {
14179 /* If it is a two-block object, account for base object, and
14180 for character block if present. */
14181 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014182 if (_PyUnicode_DATA_ANY(self))
14183 size += (PyUnicode_GET_LENGTH(self) + 1) *
14184 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014185 }
14186 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014187 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014188 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14189 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14190 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14191 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014192
14193 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014194}
14195
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014196static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014197unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014198{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014199 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014200 if (!copy)
14201 return NULL;
14202 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014203}
14204
Guido van Rossumd57fd912000-03-10 22:53:23 +000014205static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014206 UNICODE_ENCODE_METHODDEF
14207 UNICODE_REPLACE_METHODDEF
14208 UNICODE_SPLIT_METHODDEF
14209 UNICODE_RSPLIT_METHODDEF
14210 UNICODE_JOIN_METHODDEF
14211 UNICODE_CAPITALIZE_METHODDEF
14212 UNICODE_CASEFOLD_METHODDEF
14213 UNICODE_TITLE_METHODDEF
14214 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014215 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014216 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014217 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014218 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014219 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014220 UNICODE_LJUST_METHODDEF
14221 UNICODE_LOWER_METHODDEF
14222 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014223 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14224 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014225 UNICODE_RJUST_METHODDEF
14226 UNICODE_RSTRIP_METHODDEF
14227 UNICODE_RPARTITION_METHODDEF
14228 UNICODE_SPLITLINES_METHODDEF
14229 UNICODE_STRIP_METHODDEF
14230 UNICODE_SWAPCASE_METHODDEF
14231 UNICODE_TRANSLATE_METHODDEF
14232 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014233 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14234 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014235 UNICODE_REMOVEPREFIX_METHODDEF
14236 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014237 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014238 UNICODE_ISLOWER_METHODDEF
14239 UNICODE_ISUPPER_METHODDEF
14240 UNICODE_ISTITLE_METHODDEF
14241 UNICODE_ISSPACE_METHODDEF
14242 UNICODE_ISDECIMAL_METHODDEF
14243 UNICODE_ISDIGIT_METHODDEF
14244 UNICODE_ISNUMERIC_METHODDEF
14245 UNICODE_ISALPHA_METHODDEF
14246 UNICODE_ISALNUM_METHODDEF
14247 UNICODE_ISIDENTIFIER_METHODDEF
14248 UNICODE_ISPRINTABLE_METHODDEF
14249 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014250 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014251 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014252 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014253 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014254 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014255#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014256 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014257 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014258#endif
14259
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014260 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014261 {NULL, NULL}
14262};
14263
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014264static PyObject *
14265unicode_mod(PyObject *v, PyObject *w)
14266{
Brian Curtindfc80e32011-08-10 20:28:54 -050014267 if (!PyUnicode_Check(v))
14268 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014269 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014270}
14271
14272static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014273 0, /*nb_add*/
14274 0, /*nb_subtract*/
14275 0, /*nb_multiply*/
14276 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014277};
14278
Guido van Rossumd57fd912000-03-10 22:53:23 +000014279static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014280 (lenfunc) unicode_length, /* sq_length */
14281 PyUnicode_Concat, /* sq_concat */
14282 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14283 (ssizeargfunc) unicode_getitem, /* sq_item */
14284 0, /* sq_slice */
14285 0, /* sq_ass_item */
14286 0, /* sq_ass_slice */
14287 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014288};
14289
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014290static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014291unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014293 if (PyUnicode_READY(self) == -1)
14294 return NULL;
14295
Victor Stinnera15e2602020-04-08 02:01:56 +020014296 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014297 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014298 if (i == -1 && PyErr_Occurred())
14299 return NULL;
14300 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014301 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014302 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014303 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014304 Py_ssize_t start, stop, step, slicelength, i;
14305 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014306 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014307 const void *src_data;
14308 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014309 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014310 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014311
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014312 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014313 return NULL;
14314 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014315 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14316 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014317
14318 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014319 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014320 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014321 slicelength == PyUnicode_GET_LENGTH(self)) {
14322 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014323 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014324 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014325 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014326 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014327 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014328 src_kind = PyUnicode_KIND(self);
14329 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014330 if (!PyUnicode_IS_ASCII(self)) {
14331 kind_limit = kind_maxchar_limit(src_kind);
14332 max_char = 0;
14333 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14334 ch = PyUnicode_READ(src_kind, src_data, cur);
14335 if (ch > max_char) {
14336 max_char = ch;
14337 if (max_char >= kind_limit)
14338 break;
14339 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014340 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014341 }
Victor Stinner55c99112011-10-13 01:17:06 +020014342 else
14343 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014344 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014345 if (result == NULL)
14346 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014347 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014348 dest_data = PyUnicode_DATA(result);
14349
14350 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014351 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14352 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014353 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014354 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014355 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014356 } else {
14357 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14358 return NULL;
14359 }
14360}
14361
14362static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014363 (lenfunc)unicode_length, /* mp_length */
14364 (binaryfunc)unicode_subscript, /* mp_subscript */
14365 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014366};
14367
Guido van Rossumd57fd912000-03-10 22:53:23 +000014368
Guido van Rossumd57fd912000-03-10 22:53:23 +000014369/* Helpers for PyUnicode_Format() */
14370
Victor Stinnera47082312012-10-04 02:19:54 +020014371struct unicode_formatter_t {
14372 PyObject *args;
14373 int args_owned;
14374 Py_ssize_t arglen, argidx;
14375 PyObject *dict;
14376
14377 enum PyUnicode_Kind fmtkind;
14378 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014379 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014380 PyObject *fmtstr;
14381
14382 _PyUnicodeWriter writer;
14383};
14384
14385struct unicode_format_arg_t {
14386 Py_UCS4 ch;
14387 int flags;
14388 Py_ssize_t width;
14389 int prec;
14390 int sign;
14391};
14392
Guido van Rossumd57fd912000-03-10 22:53:23 +000014393static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014394unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014395{
Victor Stinnera47082312012-10-04 02:19:54 +020014396 Py_ssize_t argidx = ctx->argidx;
14397
14398 if (argidx < ctx->arglen) {
14399 ctx->argidx++;
14400 if (ctx->arglen < 0)
14401 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014402 else
Victor Stinnera47082312012-10-04 02:19:54 +020014403 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014404 }
14405 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014406 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014407 return NULL;
14408}
14409
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014410/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014411
Victor Stinnera47082312012-10-04 02:19:54 +020014412/* Format a float into the writer if the writer is not NULL, or into *p_output
14413 otherwise.
14414
14415 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014416static int
Victor Stinnera47082312012-10-04 02:19:54 +020014417formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14418 PyObject **p_output,
14419 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014420{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014421 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014422 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014423 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014424 int prec;
14425 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014426
Guido van Rossumd57fd912000-03-10 22:53:23 +000014427 x = PyFloat_AsDouble(v);
14428 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014429 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014430
Victor Stinnera47082312012-10-04 02:19:54 +020014431 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014432 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014433 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014434
Victor Stinnera47082312012-10-04 02:19:54 +020014435 if (arg->flags & F_ALT)
14436 dtoa_flags = Py_DTSF_ALT;
14437 else
14438 dtoa_flags = 0;
14439 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014440 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014441 return -1;
14442 len = strlen(p);
14443 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014444 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014445 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014446 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014447 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014448 }
14449 else
14450 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014451 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014452 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014453}
14454
Victor Stinnerd0880d52012-04-27 23:40:13 +020014455/* formatlong() emulates the format codes d, u, o, x and X, and
14456 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14457 * Python's regular ints.
14458 * Return value: a new PyUnicodeObject*, or NULL if error.
14459 * The output string is of the form
14460 * "-"? ("0x" | "0X")? digit+
14461 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14462 * set in flags. The case of hex digits will be correct,
14463 * There will be at least prec digits, zero-filled on the left if
14464 * necessary to get that many.
14465 * val object to be converted
14466 * flags bitmask of format flags; only F_ALT is looked at
14467 * prec minimum number of digits; 0-fill on left if needed
14468 * type a character in [duoxX]; u acts the same as d
14469 *
14470 * CAUTION: o, x and X conversions on regular ints can never
14471 * produce a '-' sign, but can for Python's unbounded ints.
14472 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014473PyObject *
14474_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014475{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014476 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014477 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014478 Py_ssize_t i;
14479 int sign; /* 1 if '-', else 0 */
14480 int len; /* number of characters */
14481 Py_ssize_t llen;
14482 int numdigits; /* len == numnondigits + numdigits */
14483 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014484
Victor Stinnerd0880d52012-04-27 23:40:13 +020014485 /* Avoid exceeding SSIZE_T_MAX */
14486 if (prec > INT_MAX-3) {
14487 PyErr_SetString(PyExc_OverflowError,
14488 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014489 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014490 }
14491
14492 assert(PyLong_Check(val));
14493
14494 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014495 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014496 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014497 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014498 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014499 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014500 /* int and int subclasses should print numerically when a numeric */
14501 /* format code is used (see issue18780) */
14502 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014503 break;
14504 case 'o':
14505 numnondigits = 2;
14506 result = PyNumber_ToBase(val, 8);
14507 break;
14508 case 'x':
14509 case 'X':
14510 numnondigits = 2;
14511 result = PyNumber_ToBase(val, 16);
14512 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014513 }
14514 if (!result)
14515 return NULL;
14516
14517 assert(unicode_modifiable(result));
14518 assert(PyUnicode_IS_READY(result));
14519 assert(PyUnicode_IS_ASCII(result));
14520
14521 /* To modify the string in-place, there can only be one reference. */
14522 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014523 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014524 PyErr_BadInternalCall();
14525 return NULL;
14526 }
14527 buf = PyUnicode_DATA(result);
14528 llen = PyUnicode_GET_LENGTH(result);
14529 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014530 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014531 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014532 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014533 return NULL;
14534 }
14535 len = (int)llen;
14536 sign = buf[0] == '-';
14537 numnondigits += sign;
14538 numdigits = len - numnondigits;
14539 assert(numdigits > 0);
14540
14541 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014542 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014543 (type == 'o' || type == 'x' || type == 'X'))) {
14544 assert(buf[sign] == '0');
14545 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14546 buf[sign+1] == 'o');
14547 numnondigits -= 2;
14548 buf += 2;
14549 len -= 2;
14550 if (sign)
14551 buf[0] = '-';
14552 assert(len == numnondigits + numdigits);
14553 assert(numdigits > 0);
14554 }
14555
14556 /* Fill with leading zeroes to meet minimum width. */
14557 if (prec > numdigits) {
14558 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14559 numnondigits + prec);
14560 char *b1;
14561 if (!r1) {
14562 Py_DECREF(result);
14563 return NULL;
14564 }
14565 b1 = PyBytes_AS_STRING(r1);
14566 for (i = 0; i < numnondigits; ++i)
14567 *b1++ = *buf++;
14568 for (i = 0; i < prec - numdigits; i++)
14569 *b1++ = '0';
14570 for (i = 0; i < numdigits; i++)
14571 *b1++ = *buf++;
14572 *b1 = '\0';
14573 Py_DECREF(result);
14574 result = r1;
14575 buf = PyBytes_AS_STRING(result);
14576 len = numnondigits + prec;
14577 }
14578
14579 /* Fix up case for hex conversions. */
14580 if (type == 'X') {
14581 /* Need to convert all lower case letters to upper case.
14582 and need to convert 0x to 0X (and -0x to -0X). */
14583 for (i = 0; i < len; i++)
14584 if (buf[i] >= 'a' && buf[i] <= 'x')
14585 buf[i] -= 'a'-'A';
14586 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014587 if (!PyUnicode_Check(result)
14588 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014589 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014590 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014591 Py_DECREF(result);
14592 result = unicode;
14593 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014594 else if (len != PyUnicode_GET_LENGTH(result)) {
14595 if (PyUnicode_Resize(&result, len) < 0)
14596 Py_CLEAR(result);
14597 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014598 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014599}
14600
Ethan Furmandf3ed242014-01-05 06:50:30 -080014601/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014602 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014603 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014604 * -1 and raise an exception on error */
14605static int
Victor Stinnera47082312012-10-04 02:19:54 +020014606mainformatlong(PyObject *v,
14607 struct unicode_format_arg_t *arg,
14608 PyObject **p_output,
14609 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014610{
14611 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014612 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014613
14614 if (!PyNumber_Check(v))
14615 goto wrongtype;
14616
Ethan Furman9ab74802014-03-21 06:38:46 -070014617 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014618 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014619 if (type == 'o' || type == 'x' || type == 'X') {
14620 iobj = PyNumber_Index(v);
14621 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014622 if (PyErr_ExceptionMatches(PyExc_TypeError))
14623 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014624 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014625 }
14626 }
14627 else {
14628 iobj = PyNumber_Long(v);
14629 if (iobj == NULL ) {
14630 if (PyErr_ExceptionMatches(PyExc_TypeError))
14631 goto wrongtype;
14632 return -1;
14633 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014634 }
14635 assert(PyLong_Check(iobj));
14636 }
14637 else {
14638 iobj = v;
14639 Py_INCREF(iobj);
14640 }
14641
14642 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014643 && arg->width == -1 && arg->prec == -1
14644 && !(arg->flags & (F_SIGN | F_BLANK))
14645 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014646 {
14647 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014648 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014649 int base;
14650
Victor Stinnera47082312012-10-04 02:19:54 +020014651 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014652 {
14653 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014654 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014655 case 'd':
14656 case 'i':
14657 case 'u':
14658 base = 10;
14659 break;
14660 case 'o':
14661 base = 8;
14662 break;
14663 case 'x':
14664 case 'X':
14665 base = 16;
14666 break;
14667 }
14668
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014669 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14670 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014671 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014672 }
14673 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014674 return 1;
14675 }
14676
Ethan Furmanb95b5612015-01-23 20:05:18 -080014677 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014678 Py_DECREF(iobj);
14679 if (res == NULL)
14680 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014681 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014682 return 0;
14683
14684wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014685 switch(type)
14686 {
14687 case 'o':
14688 case 'x':
14689 case 'X':
14690 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014691 "%%%c format: an integer is required, "
14692 "not %.200s",
14693 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014694 break;
14695 default:
14696 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014697 "%%%c format: a number is required, "
14698 "not %.200s",
14699 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014700 break;
14701 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014702 return -1;
14703}
14704
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014705static Py_UCS4
14706formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014707{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014708 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014709 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014710 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014711 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014712 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014713 goto onError;
14714 }
14715 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014716 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014717 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014718 /* make sure number is a type of integer */
14719 if (!PyLong_Check(v)) {
14720 iobj = PyNumber_Index(v);
14721 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014722 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014723 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014724 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014725 Py_DECREF(iobj);
14726 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014727 else {
14728 x = PyLong_AsLong(v);
14729 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014730 if (x == -1 && PyErr_Occurred())
14731 goto onError;
14732
Victor Stinner8faf8212011-12-08 22:14:11 +010014733 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014734 PyErr_SetString(PyExc_OverflowError,
14735 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014736 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014737 }
14738
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014739 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014740 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014741
Benjamin Peterson29060642009-01-31 22:14:21 +000014742 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014743 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014744 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014745 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014746}
14747
Victor Stinnera47082312012-10-04 02:19:54 +020014748/* Parse options of an argument: flags, width, precision.
14749 Handle also "%(name)" syntax.
14750
14751 Return 0 if the argument has been formatted into arg->str.
14752 Return 1 if the argument has been written into ctx->writer,
14753 Raise an exception and return -1 on error. */
14754static int
14755unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14756 struct unicode_format_arg_t *arg)
14757{
14758#define FORMAT_READ(ctx) \
14759 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14760
14761 PyObject *v;
14762
Victor Stinnera47082312012-10-04 02:19:54 +020014763 if (arg->ch == '(') {
14764 /* Get argument value from a dictionary. Example: "%(name)s". */
14765 Py_ssize_t keystart;
14766 Py_ssize_t keylen;
14767 PyObject *key;
14768 int pcount = 1;
14769
14770 if (ctx->dict == NULL) {
14771 PyErr_SetString(PyExc_TypeError,
14772 "format requires a mapping");
14773 return -1;
14774 }
14775 ++ctx->fmtpos;
14776 --ctx->fmtcnt;
14777 keystart = ctx->fmtpos;
14778 /* Skip over balanced parentheses */
14779 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14780 arg->ch = FORMAT_READ(ctx);
14781 if (arg->ch == ')')
14782 --pcount;
14783 else if (arg->ch == '(')
14784 ++pcount;
14785 ctx->fmtpos++;
14786 }
14787 keylen = ctx->fmtpos - keystart - 1;
14788 if (ctx->fmtcnt < 0 || pcount > 0) {
14789 PyErr_SetString(PyExc_ValueError,
14790 "incomplete format key");
14791 return -1;
14792 }
14793 key = PyUnicode_Substring(ctx->fmtstr,
14794 keystart, keystart + keylen);
14795 if (key == NULL)
14796 return -1;
14797 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014798 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014799 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014800 }
14801 ctx->args = PyObject_GetItem(ctx->dict, key);
14802 Py_DECREF(key);
14803 if (ctx->args == NULL)
14804 return -1;
14805 ctx->args_owned = 1;
14806 ctx->arglen = -1;
14807 ctx->argidx = -2;
14808 }
14809
14810 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014811 while (--ctx->fmtcnt >= 0) {
14812 arg->ch = FORMAT_READ(ctx);
14813 ctx->fmtpos++;
14814 switch (arg->ch) {
14815 case '-': arg->flags |= F_LJUST; continue;
14816 case '+': arg->flags |= F_SIGN; continue;
14817 case ' ': arg->flags |= F_BLANK; continue;
14818 case '#': arg->flags |= F_ALT; continue;
14819 case '0': arg->flags |= F_ZERO; continue;
14820 }
14821 break;
14822 }
14823
14824 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014825 if (arg->ch == '*') {
14826 v = unicode_format_getnextarg(ctx);
14827 if (v == NULL)
14828 return -1;
14829 if (!PyLong_Check(v)) {
14830 PyErr_SetString(PyExc_TypeError,
14831 "* wants int");
14832 return -1;
14833 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014834 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014835 if (arg->width == -1 && PyErr_Occurred())
14836 return -1;
14837 if (arg->width < 0) {
14838 arg->flags |= F_LJUST;
14839 arg->width = -arg->width;
14840 }
14841 if (--ctx->fmtcnt >= 0) {
14842 arg->ch = FORMAT_READ(ctx);
14843 ctx->fmtpos++;
14844 }
14845 }
14846 else if (arg->ch >= '0' && arg->ch <= '9') {
14847 arg->width = arg->ch - '0';
14848 while (--ctx->fmtcnt >= 0) {
14849 arg->ch = FORMAT_READ(ctx);
14850 ctx->fmtpos++;
14851 if (arg->ch < '0' || arg->ch > '9')
14852 break;
14853 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14854 mixing signed and unsigned comparison. Since arg->ch is between
14855 '0' and '9', casting to int is safe. */
14856 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14857 PyErr_SetString(PyExc_ValueError,
14858 "width too big");
14859 return -1;
14860 }
14861 arg->width = arg->width*10 + (arg->ch - '0');
14862 }
14863 }
14864
14865 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014866 if (arg->ch == '.') {
14867 arg->prec = 0;
14868 if (--ctx->fmtcnt >= 0) {
14869 arg->ch = FORMAT_READ(ctx);
14870 ctx->fmtpos++;
14871 }
14872 if (arg->ch == '*') {
14873 v = unicode_format_getnextarg(ctx);
14874 if (v == NULL)
14875 return -1;
14876 if (!PyLong_Check(v)) {
14877 PyErr_SetString(PyExc_TypeError,
14878 "* wants int");
14879 return -1;
14880 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014881 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014882 if (arg->prec == -1 && PyErr_Occurred())
14883 return -1;
14884 if (arg->prec < 0)
14885 arg->prec = 0;
14886 if (--ctx->fmtcnt >= 0) {
14887 arg->ch = FORMAT_READ(ctx);
14888 ctx->fmtpos++;
14889 }
14890 }
14891 else if (arg->ch >= '0' && arg->ch <= '9') {
14892 arg->prec = arg->ch - '0';
14893 while (--ctx->fmtcnt >= 0) {
14894 arg->ch = FORMAT_READ(ctx);
14895 ctx->fmtpos++;
14896 if (arg->ch < '0' || arg->ch > '9')
14897 break;
14898 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14899 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014900 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014901 return -1;
14902 }
14903 arg->prec = arg->prec*10 + (arg->ch - '0');
14904 }
14905 }
14906 }
14907
14908 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14909 if (ctx->fmtcnt >= 0) {
14910 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14911 if (--ctx->fmtcnt >= 0) {
14912 arg->ch = FORMAT_READ(ctx);
14913 ctx->fmtpos++;
14914 }
14915 }
14916 }
14917 if (ctx->fmtcnt < 0) {
14918 PyErr_SetString(PyExc_ValueError,
14919 "incomplete format");
14920 return -1;
14921 }
14922 return 0;
14923
14924#undef FORMAT_READ
14925}
14926
14927/* Format one argument. Supported conversion specifiers:
14928
14929 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014930 - "i", "d", "u": int or float
14931 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014932 - "e", "E", "f", "F", "g", "G": float
14933 - "c": int or str (1 character)
14934
Victor Stinner8dbd4212012-12-04 09:30:24 +010014935 When possible, the output is written directly into the Unicode writer
14936 (ctx->writer). A string is created when padding is required.
14937
Victor Stinnera47082312012-10-04 02:19:54 +020014938 Return 0 if the argument has been formatted into *p_str,
14939 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014940 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014941static int
14942unicode_format_arg_format(struct unicode_formatter_t *ctx,
14943 struct unicode_format_arg_t *arg,
14944 PyObject **p_str)
14945{
14946 PyObject *v;
14947 _PyUnicodeWriter *writer = &ctx->writer;
14948
14949 if (ctx->fmtcnt == 0)
14950 ctx->writer.overallocate = 0;
14951
Victor Stinnera47082312012-10-04 02:19:54 +020014952 v = unicode_format_getnextarg(ctx);
14953 if (v == NULL)
14954 return -1;
14955
Victor Stinnera47082312012-10-04 02:19:54 +020014956
14957 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014958 case 's':
14959 case 'r':
14960 case 'a':
14961 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14962 /* Fast path */
14963 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14964 return -1;
14965 return 1;
14966 }
14967
14968 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14969 *p_str = v;
14970 Py_INCREF(*p_str);
14971 }
14972 else {
14973 if (arg->ch == 's')
14974 *p_str = PyObject_Str(v);
14975 else if (arg->ch == 'r')
14976 *p_str = PyObject_Repr(v);
14977 else
14978 *p_str = PyObject_ASCII(v);
14979 }
14980 break;
14981
14982 case 'i':
14983 case 'd':
14984 case 'u':
14985 case 'o':
14986 case 'x':
14987 case 'X':
14988 {
14989 int ret = mainformatlong(v, arg, p_str, writer);
14990 if (ret != 0)
14991 return ret;
14992 arg->sign = 1;
14993 break;
14994 }
14995
14996 case 'e':
14997 case 'E':
14998 case 'f':
14999 case 'F':
15000 case 'g':
15001 case 'G':
15002 if (arg->width == -1 && arg->prec == -1
15003 && !(arg->flags & (F_SIGN | F_BLANK)))
15004 {
15005 /* Fast path */
15006 if (formatfloat(v, arg, NULL, writer) == -1)
15007 return -1;
15008 return 1;
15009 }
15010
15011 arg->sign = 1;
15012 if (formatfloat(v, arg, p_str, NULL) == -1)
15013 return -1;
15014 break;
15015
15016 case 'c':
15017 {
15018 Py_UCS4 ch = formatchar(v);
15019 if (ch == (Py_UCS4) -1)
15020 return -1;
15021 if (arg->width == -1 && arg->prec == -1) {
15022 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015023 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015024 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015025 return 1;
15026 }
15027 *p_str = PyUnicode_FromOrdinal(ch);
15028 break;
15029 }
15030
15031 default:
15032 PyErr_Format(PyExc_ValueError,
15033 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015034 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015035 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15036 (int)arg->ch,
15037 ctx->fmtpos - 1);
15038 return -1;
15039 }
15040 if (*p_str == NULL)
15041 return -1;
15042 assert (PyUnicode_Check(*p_str));
15043 return 0;
15044}
15045
15046static int
15047unicode_format_arg_output(struct unicode_formatter_t *ctx,
15048 struct unicode_format_arg_t *arg,
15049 PyObject *str)
15050{
15051 Py_ssize_t len;
15052 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015053 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015054 Py_ssize_t pindex;
15055 Py_UCS4 signchar;
15056 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015057 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015058 Py_ssize_t sublen;
15059 _PyUnicodeWriter *writer = &ctx->writer;
15060 Py_UCS4 fill;
15061
15062 fill = ' ';
15063 if (arg->sign && arg->flags & F_ZERO)
15064 fill = '0';
15065
15066 if (PyUnicode_READY(str) == -1)
15067 return -1;
15068
15069 len = PyUnicode_GET_LENGTH(str);
15070 if ((arg->width == -1 || arg->width <= len)
15071 && (arg->prec == -1 || arg->prec >= len)
15072 && !(arg->flags & (F_SIGN | F_BLANK)))
15073 {
15074 /* Fast path */
15075 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15076 return -1;
15077 return 0;
15078 }
15079
15080 /* Truncate the string for "s", "r" and "a" formats
15081 if the precision is set */
15082 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15083 if (arg->prec >= 0 && len > arg->prec)
15084 len = arg->prec;
15085 }
15086
15087 /* Adjust sign and width */
15088 kind = PyUnicode_KIND(str);
15089 pbuf = PyUnicode_DATA(str);
15090 pindex = 0;
15091 signchar = '\0';
15092 if (arg->sign) {
15093 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15094 if (ch == '-' || ch == '+') {
15095 signchar = ch;
15096 len--;
15097 pindex++;
15098 }
15099 else if (arg->flags & F_SIGN)
15100 signchar = '+';
15101 else if (arg->flags & F_BLANK)
15102 signchar = ' ';
15103 else
15104 arg->sign = 0;
15105 }
15106 if (arg->width < len)
15107 arg->width = len;
15108
15109 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015110 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015111 if (!(arg->flags & F_LJUST)) {
15112 if (arg->sign) {
15113 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015114 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015115 }
15116 else {
15117 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015118 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015119 }
15120 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015121 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15122 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015123 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015124 }
15125
Victor Stinnera47082312012-10-04 02:19:54 +020015126 buflen = arg->width;
15127 if (arg->sign && len == arg->width)
15128 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015129 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015130 return -1;
15131
15132 /* Write the sign if needed */
15133 if (arg->sign) {
15134 if (fill != ' ') {
15135 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15136 writer->pos += 1;
15137 }
15138 if (arg->width > len)
15139 arg->width--;
15140 }
15141
15142 /* Write the numeric prefix for "x", "X" and "o" formats
15143 if the alternate form is used.
15144 For example, write "0x" for the "%#x" format. */
15145 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15146 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15147 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15148 if (fill != ' ') {
15149 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15150 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15151 writer->pos += 2;
15152 pindex += 2;
15153 }
15154 arg->width -= 2;
15155 if (arg->width < 0)
15156 arg->width = 0;
15157 len -= 2;
15158 }
15159
15160 /* Pad left with the fill character if needed */
15161 if (arg->width > len && !(arg->flags & F_LJUST)) {
15162 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015163 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015164 writer->pos += sublen;
15165 arg->width = len;
15166 }
15167
15168 /* If padding with spaces: write sign if needed and/or numeric prefix if
15169 the alternate form is used */
15170 if (fill == ' ') {
15171 if (arg->sign) {
15172 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15173 writer->pos += 1;
15174 }
15175 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15176 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15177 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15178 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15179 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15180 writer->pos += 2;
15181 pindex += 2;
15182 }
15183 }
15184
15185 /* Write characters */
15186 if (len) {
15187 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15188 str, pindex, len);
15189 writer->pos += len;
15190 }
15191
15192 /* Pad right with the fill character if needed */
15193 if (arg->width > len) {
15194 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015195 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015196 writer->pos += sublen;
15197 }
15198 return 0;
15199}
15200
15201/* Helper of PyUnicode_Format(): format one arg.
15202 Return 0 on success, raise an exception and return -1 on error. */
15203static int
15204unicode_format_arg(struct unicode_formatter_t *ctx)
15205{
15206 struct unicode_format_arg_t arg;
15207 PyObject *str;
15208 int ret;
15209
Victor Stinner8dbd4212012-12-04 09:30:24 +010015210 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015211 if (arg.ch == '%') {
15212 ctx->fmtpos++;
15213 ctx->fmtcnt--;
15214 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15215 return -1;
15216 return 0;
15217 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015218 arg.flags = 0;
15219 arg.width = -1;
15220 arg.prec = -1;
15221 arg.sign = 0;
15222 str = NULL;
15223
Victor Stinnera47082312012-10-04 02:19:54 +020015224 ret = unicode_format_arg_parse(ctx, &arg);
15225 if (ret == -1)
15226 return -1;
15227
15228 ret = unicode_format_arg_format(ctx, &arg, &str);
15229 if (ret == -1)
15230 return -1;
15231
15232 if (ret != 1) {
15233 ret = unicode_format_arg_output(ctx, &arg, str);
15234 Py_DECREF(str);
15235 if (ret == -1)
15236 return -1;
15237 }
15238
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015239 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015240 PyErr_SetString(PyExc_TypeError,
15241 "not all arguments converted during string formatting");
15242 return -1;
15243 }
15244 return 0;
15245}
15246
Alexander Belopolsky40018472011-02-26 01:02:56 +000015247PyObject *
15248PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015249{
Victor Stinnera47082312012-10-04 02:19:54 +020015250 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015251
Guido van Rossumd57fd912000-03-10 22:53:23 +000015252 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015253 PyErr_BadInternalCall();
15254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015255 }
Victor Stinnera47082312012-10-04 02:19:54 +020015256
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015257 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015258 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015259
15260 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015261 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15262 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15263 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15264 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015265
Victor Stinner8f674cc2013-04-17 23:02:17 +020015266 _PyUnicodeWriter_Init(&ctx.writer);
15267 ctx.writer.min_length = ctx.fmtcnt + 100;
15268 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015269
Guido van Rossumd57fd912000-03-10 22:53:23 +000015270 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015271 ctx.arglen = PyTuple_Size(args);
15272 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015273 }
15274 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015275 ctx.arglen = -1;
15276 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015277 }
Victor Stinnera47082312012-10-04 02:19:54 +020015278 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015279 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015280 ctx.dict = args;
15281 else
15282 ctx.dict = NULL;
15283 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015284
Victor Stinnera47082312012-10-04 02:19:54 +020015285 while (--ctx.fmtcnt >= 0) {
15286 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015287 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015288
15289 nonfmtpos = ctx.fmtpos++;
15290 while (ctx.fmtcnt >= 0 &&
15291 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15292 ctx.fmtpos++;
15293 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015294 }
Victor Stinnera47082312012-10-04 02:19:54 +020015295 if (ctx.fmtcnt < 0) {
15296 ctx.fmtpos--;
15297 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015298 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015299
Victor Stinnercfc4c132013-04-03 01:48:39 +020015300 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15301 nonfmtpos, ctx.fmtpos) < 0)
15302 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015303 }
15304 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015305 ctx.fmtpos++;
15306 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015307 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015308 }
15309 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015310
Victor Stinnera47082312012-10-04 02:19:54 +020015311 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015312 PyErr_SetString(PyExc_TypeError,
15313 "not all arguments converted during string formatting");
15314 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015315 }
15316
Victor Stinnera47082312012-10-04 02:19:54 +020015317 if (ctx.args_owned) {
15318 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015319 }
Victor Stinnera47082312012-10-04 02:19:54 +020015320 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015321
Benjamin Peterson29060642009-01-31 22:14:21 +000015322 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015323 _PyUnicodeWriter_Dealloc(&ctx.writer);
15324 if (ctx.args_owned) {
15325 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015326 }
15327 return NULL;
15328}
15329
Jeremy Hylton938ace62002-07-17 16:30:39 +000015330static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015331unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15332
Tim Peters6d6c1a32001-08-02 04:15:00 +000015333static PyObject *
15334unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15335{
Benjamin Peterson29060642009-01-31 22:14:21 +000015336 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015337 static char *kwlist[] = {"object", "encoding", "errors", 0};
15338 char *encoding = NULL;
15339 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015340
Benjamin Peterson14339b62009-01-31 16:36:08 +000015341 if (type != &PyUnicode_Type)
15342 return unicode_subtype_new(type, args, kwds);
15343 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015344 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015345 return NULL;
15346 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015347 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015348 if (encoding == NULL && errors == NULL)
15349 return PyObject_Str(x);
15350 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015351 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015352}
15353
Guido van Rossume023fe02001-08-30 03:12:59 +000015354static PyObject *
15355unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15356{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015357 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015358 Py_ssize_t length, char_size;
15359 int share_wstr, share_utf8;
15360 unsigned int kind;
15361 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015362
Benjamin Peterson14339b62009-01-31 16:36:08 +000015363 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015364
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015365 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015366 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015367 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015368 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015369 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015370 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015371 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015372 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015373
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015374 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015375 if (self == NULL) {
15376 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015377 return NULL;
15378 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015379 kind = PyUnicode_KIND(unicode);
15380 length = PyUnicode_GET_LENGTH(unicode);
15381
15382 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015383#ifdef Py_DEBUG
15384 _PyUnicode_HASH(self) = -1;
15385#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015386 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015387#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015388 _PyUnicode_STATE(self).interned = 0;
15389 _PyUnicode_STATE(self).kind = kind;
15390 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015391 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015392 _PyUnicode_STATE(self).ready = 1;
15393 _PyUnicode_WSTR(self) = NULL;
15394 _PyUnicode_UTF8_LENGTH(self) = 0;
15395 _PyUnicode_UTF8(self) = NULL;
15396 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015397 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015398
15399 share_utf8 = 0;
15400 share_wstr = 0;
15401 if (kind == PyUnicode_1BYTE_KIND) {
15402 char_size = 1;
15403 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15404 share_utf8 = 1;
15405 }
15406 else if (kind == PyUnicode_2BYTE_KIND) {
15407 char_size = 2;
15408 if (sizeof(wchar_t) == 2)
15409 share_wstr = 1;
15410 }
15411 else {
15412 assert(kind == PyUnicode_4BYTE_KIND);
15413 char_size = 4;
15414 if (sizeof(wchar_t) == 4)
15415 share_wstr = 1;
15416 }
15417
15418 /* Ensure we won't overflow the length. */
15419 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15420 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015421 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015422 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015423 data = PyObject_MALLOC((length + 1) * char_size);
15424 if (data == NULL) {
15425 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015426 goto onError;
15427 }
15428
Victor Stinnerc3c74152011-10-02 20:39:55 +020015429 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015430 if (share_utf8) {
15431 _PyUnicode_UTF8_LENGTH(self) = length;
15432 _PyUnicode_UTF8(self) = data;
15433 }
15434 if (share_wstr) {
15435 _PyUnicode_WSTR_LENGTH(self) = length;
15436 _PyUnicode_WSTR(self) = (wchar_t *)data;
15437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015438
Christian Heimesf051e432016-09-13 20:22:02 +020015439 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015440 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015441 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015442#ifdef Py_DEBUG
15443 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15444#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015445 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015446 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015447
15448onError:
15449 Py_DECREF(unicode);
15450 Py_DECREF(self);
15451 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015452}
15453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015454PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015455"str(object='') -> str\n\
15456str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015457\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015458Create a new string object from the given object. If encoding or\n\
15459errors is specified, then the object must expose a data buffer\n\
15460that will be decoded using the given encoding and error handler.\n\
15461Otherwise, returns the result of object.__str__() (if defined)\n\
15462or repr(object).\n\
15463encoding defaults to sys.getdefaultencoding().\n\
15464errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015465
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015466static PyObject *unicode_iter(PyObject *seq);
15467
Guido van Rossumd57fd912000-03-10 22:53:23 +000015468PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015469 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015470 "str", /* tp_name */
15471 sizeof(PyUnicodeObject), /* tp_basicsize */
15472 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015473 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015474 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015475 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015476 0, /* tp_getattr */
15477 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015478 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015479 unicode_repr, /* tp_repr */
15480 &unicode_as_number, /* tp_as_number */
15481 &unicode_as_sequence, /* tp_as_sequence */
15482 &unicode_as_mapping, /* tp_as_mapping */
15483 (hashfunc) unicode_hash, /* tp_hash*/
15484 0, /* tp_call*/
15485 (reprfunc) unicode_str, /* tp_str */
15486 PyObject_GenericGetAttr, /* tp_getattro */
15487 0, /* tp_setattro */
15488 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015489 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015490 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15491 unicode_doc, /* tp_doc */
15492 0, /* tp_traverse */
15493 0, /* tp_clear */
15494 PyUnicode_RichCompare, /* tp_richcompare */
15495 0, /* tp_weaklistoffset */
15496 unicode_iter, /* tp_iter */
15497 0, /* tp_iternext */
15498 unicode_methods, /* tp_methods */
15499 0, /* tp_members */
15500 0, /* tp_getset */
15501 &PyBaseObject_Type, /* tp_base */
15502 0, /* tp_dict */
15503 0, /* tp_descr_get */
15504 0, /* tp_descr_set */
15505 0, /* tp_dictoffset */
15506 0, /* tp_init */
15507 0, /* tp_alloc */
15508 unicode_new, /* tp_new */
15509 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015510};
15511
15512/* Initialize the Unicode implementation */
15513
Victor Stinner331a6a52019-05-27 16:39:22 +020015514PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015515_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015516{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015517 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015518 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015519 0x000A, /* LINE FEED */
15520 0x000D, /* CARRIAGE RETURN */
15521 0x001C, /* FILE SEPARATOR */
15522 0x001D, /* GROUP SEPARATOR */
15523 0x001E, /* RECORD SEPARATOR */
15524 0x0085, /* NEXT LINE */
15525 0x2028, /* LINE SEPARATOR */
15526 0x2029, /* PARAGRAPH SEPARATOR */
15527 };
15528
Fred Drakee4315f52000-05-09 19:53:39 +000015529 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015530 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015531 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015532 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015533 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015534 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015535
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015536 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015537 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015538 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015539
15540 /* initialize the linebreak bloom filter */
15541 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015542 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015543 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015544
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015545 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015546 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015547 }
15548 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015549 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015550 }
15551 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015552 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015553 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015554 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015555}
15556
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015557
Walter Dörwald16807132007-05-25 13:52:07 +000015558void
15559PyUnicode_InternInPlace(PyObject **p)
15560{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015561 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015562#ifdef Py_DEBUG
15563 assert(s != NULL);
15564 assert(_PyUnicode_CHECK(s));
15565#else
Victor Stinner607b1022020-05-05 18:50:30 +020015566 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015567 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015568 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015569#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015570
Benjamin Peterson14339b62009-01-31 16:36:08 +000015571 /* If it's a subclass, we don't really know what putting
15572 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015573 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015574 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015575 }
15576
15577 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015578 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015579 }
15580
15581#ifdef INTERNED_STRINGS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015582 if (interned == NULL) {
15583 interned = PyDict_New();
15584 if (interned == NULL) {
15585 PyErr_Clear(); /* Don't leave an exception */
15586 return;
15587 }
15588 }
Victor Stinner607b1022020-05-05 18:50:30 +020015589
15590 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015591 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015592 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015593 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015594
Berker Peksagced8d4c2016-07-25 04:40:39 +030015595 if (t == NULL) {
15596 PyErr_Clear();
15597 return;
15598 }
Victor Stinner607b1022020-05-05 18:50:30 +020015599
Berker Peksagced8d4c2016-07-25 04:40:39 +030015600 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015601 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015602 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015603 return;
15604 }
Victor Stinner607b1022020-05-05 18:50:30 +020015605
Benjamin Peterson14339b62009-01-31 16:36:08 +000015606 /* The two references in interned are not counted by refcnt.
15607 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015608 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015609 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015610#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015611}
15612
15613void
15614PyUnicode_InternImmortal(PyObject **p)
15615{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015616 PyUnicode_InternInPlace(p);
15617 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015618 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015619 Py_INCREF(*p);
15620 }
Walter Dörwald16807132007-05-25 13:52:07 +000015621}
15622
15623PyObject *
15624PyUnicode_InternFromString(const char *cp)
15625{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015626 PyObject *s = PyUnicode_FromString(cp);
15627 if (s == NULL)
15628 return NULL;
15629 PyUnicode_InternInPlace(&s);
15630 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015631}
15632
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015633
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo string interning so that memory checkers see balanced refcounts.

   Every key of the `interned` dict gets its uncounted references added
   back (one for immortal strings, two for mortal ones) and is marked
   SSTATE_NOT_INTERNED; the dict is then cleared and released.  Does
   nothing when the dict does not exist.  With INTERNED_STATS defined,
   the number of strings and their cumulated lengths are written to
   stderr. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            /* Use Py_SET_REFCNT(), like PyUnicode_InternInPlace(), rather
               than assigning through Py_REFCNT(). */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Give back the key and value references that interning
               removed from the count. */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015694
15695
15696/********************* Unicode Iterator **************************/
15697
15698typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015699 PyObject_HEAD
15700 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015701 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015702} unicodeiterobject;
15703
15704static void
15705unicodeiter_dealloc(unicodeiterobject *it)
15706{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015707 _PyObject_GC_UNTRACK(it);
15708 Py_XDECREF(it->it_seq);
15709 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015710}
15711
15712static int
15713unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15714{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015715 Py_VISIT(it->it_seq);
15716 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015717}
15718
15719static PyObject *
15720unicodeiter_next(unicodeiterobject *it)
15721{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015722 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015723
Benjamin Peterson14339b62009-01-31 16:36:08 +000015724 assert(it != NULL);
15725 seq = it->it_seq;
15726 if (seq == NULL)
15727 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015728 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015730 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15731 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015732 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015733 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15734 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015735 if (item != NULL)
15736 ++it->it_index;
15737 return item;
15738 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015739
Benjamin Peterson14339b62009-01-31 16:36:08 +000015740 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015741 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015742 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015743}
15744
15745static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015746unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015747{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015748 Py_ssize_t len = 0;
15749 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015750 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015751 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015752}
15753
15754PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15755
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015756static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015757unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015758{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015759 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015760 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015761 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015762 it->it_seq, it->it_index);
15763 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015764 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015765 if (u == NULL)
15766 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015767 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015768 }
15769}
15770
15771PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15772
15773static PyObject *
15774unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15775{
15776 Py_ssize_t index = PyLong_AsSsize_t(state);
15777 if (index == -1 && PyErr_Occurred())
15778 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015779 if (it->it_seq != NULL) {
15780 if (index < 0)
15781 index = 0;
15782 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15783 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15784 it->it_index = index;
15785 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015786 Py_RETURN_NONE;
15787}
15788
15789PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15790
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015791static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015792 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015793 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015794 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15795 reduce_doc},
15796 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15797 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015798 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015799};
15800
15801PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015802 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15803 "str_iterator", /* tp_name */
15804 sizeof(unicodeiterobject), /* tp_basicsize */
15805 0, /* tp_itemsize */
15806 /* methods */
15807 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015808 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015809 0, /* tp_getattr */
15810 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015811 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015812 0, /* tp_repr */
15813 0, /* tp_as_number */
15814 0, /* tp_as_sequence */
15815 0, /* tp_as_mapping */
15816 0, /* tp_hash */
15817 0, /* tp_call */
15818 0, /* tp_str */
15819 PyObject_GenericGetAttr, /* tp_getattro */
15820 0, /* tp_setattro */
15821 0, /* tp_as_buffer */
15822 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15823 0, /* tp_doc */
15824 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15825 0, /* tp_clear */
15826 0, /* tp_richcompare */
15827 0, /* tp_weaklistoffset */
15828 PyObject_SelfIter, /* tp_iter */
15829 (iternextfunc)unicodeiter_next, /* tp_iternext */
15830 unicodeiter_methods, /* tp_methods */
15831 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015832};
15833
15834static PyObject *
15835unicode_iter(PyObject *seq)
15836{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015837 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015838
Benjamin Peterson14339b62009-01-31 16:36:08 +000015839 if (!PyUnicode_Check(seq)) {
15840 PyErr_BadInternalCall();
15841 return NULL;
15842 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015843 if (PyUnicode_READY(seq) == -1)
15844 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015845 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15846 if (it == NULL)
15847 return NULL;
15848 it->it_index = 0;
15849 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015850 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015851 _PyObject_GC_TRACK(it);
15852 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015853}
15854
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015855
15856size_t
15857Py_UNICODE_strlen(const Py_UNICODE *u)
15858{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015859 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015860}
15861
15862Py_UNICODE*
15863Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15864{
15865 Py_UNICODE *u = s1;
15866 while ((*u++ = *s2++));
15867 return s1;
15868}
15869
15870Py_UNICODE*
15871Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15872{
15873 Py_UNICODE *u = s1;
15874 while ((*u++ = *s2++))
15875 if (n-- == 0)
15876 break;
15877 return s1;
15878}
15879
15880Py_UNICODE*
15881Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15882{
15883 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015884 u1 += wcslen(u1);
15885 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015886 return s1;
15887}
15888
15889int
15890Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15891{
15892 while (*s1 && *s2 && *s1 == *s2)
15893 s1++, s2++;
15894 if (*s1 && *s2)
15895 return (*s1 < *s2) ? -1 : +1;
15896 if (*s1)
15897 return 1;
15898 if (*s2)
15899 return -1;
15900 return 0;
15901}
15902
15903int
15904Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15905{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015906 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015907 for (; n != 0; n--) {
15908 u1 = *s1;
15909 u2 = *s2;
15910 if (u1 != u2)
15911 return (u1 < u2) ? -1 : +1;
15912 if (u1 == '\0')
15913 return 0;
15914 s1++;
15915 s2++;
15916 }
15917 return 0;
15918}
15919
15920Py_UNICODE*
15921Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15922{
15923 const Py_UNICODE *p;
15924 for (p = s; *p; p++)
15925 if (*p == c)
15926 return (Py_UNICODE*)p;
15927 return NULL;
15928}
15929
15930Py_UNICODE*
15931Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15932{
15933 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015934 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015935 while (p != s) {
15936 p--;
15937 if (*p == c)
15938 return (Py_UNICODE*)p;
15939 }
15940 return NULL;
15941}
Victor Stinner331ea922010-08-10 16:37:20 +000015942
Victor Stinner71133ff2010-09-01 23:43:53 +000015943Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015944PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015945{
Victor Stinner577db2c2011-10-11 22:12:48 +020015946 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015947 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015949 if (!PyUnicode_Check(unicode)) {
15950 PyErr_BadArgument();
15951 return NULL;
15952 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015953 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015954 if (u == NULL)
15955 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015956 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015957 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015958 PyErr_NoMemory();
15959 return NULL;
15960 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015961 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015962 size *= sizeof(Py_UNICODE);
15963 copy = PyMem_Malloc(size);
15964 if (copy == NULL) {
15965 PyErr_NoMemory();
15966 return NULL;
15967 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015968 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015969 return copy;
15970}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015971
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015972
Victor Stinner709d23d2019-05-02 14:56:30 -040015973static int
15974encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015975{
Victor Stinner709d23d2019-05-02 14:56:30 -040015976 int res;
15977 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15978 if (res == -2) {
15979 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15980 return -1;
15981 }
15982 if (res < 0) {
15983 PyErr_NoMemory();
15984 return -1;
15985 }
15986 return 0;
15987}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015988
Victor Stinner709d23d2019-05-02 14:56:30 -040015989
15990static int
15991config_get_codec_name(wchar_t **config_encoding)
15992{
15993 char *encoding;
15994 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15995 return -1;
15996 }
15997
15998 PyObject *name_obj = NULL;
15999 PyObject *codec = _PyCodec_Lookup(encoding);
16000 PyMem_RawFree(encoding);
16001
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016002 if (!codec)
16003 goto error;
16004
16005 name_obj = PyObject_GetAttrString(codec, "name");
16006 Py_CLEAR(codec);
16007 if (!name_obj) {
16008 goto error;
16009 }
16010
Victor Stinner709d23d2019-05-02 14:56:30 -040016011 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16012 Py_DECREF(name_obj);
16013 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016014 goto error;
16015 }
16016
Victor Stinner709d23d2019-05-02 14:56:30 -040016017 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16018 if (raw_wname == NULL) {
16019 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016020 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016021 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016022 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016023
16024 PyMem_RawFree(*config_encoding);
16025 *config_encoding = raw_wname;
16026
16027 PyMem_Free(wname);
16028 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016029
16030error:
16031 Py_XDECREF(codec);
16032 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016033 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016034}
16035
16036
Victor Stinner331a6a52019-05-27 16:39:22 +020016037static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016038init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016039{
Victor Stinner709d23d2019-05-02 14:56:30 -040016040 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016041 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016042 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016043 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016044 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016045 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016046 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016047}
16048
16049
Victor Stinner709d23d2019-05-02 14:56:30 -040016050static int
16051init_fs_codec(PyInterpreterState *interp)
16052{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016053 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016054
16055 _Py_error_handler error_handler;
16056 error_handler = get_error_handler_wide(config->filesystem_errors);
16057 if (error_handler == _Py_ERROR_UNKNOWN) {
16058 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16059 return -1;
16060 }
16061
16062 char *encoding, *errors;
16063 if (encode_wstr_utf8(config->filesystem_encoding,
16064 &encoding,
16065 "filesystem_encoding") < 0) {
16066 return -1;
16067 }
16068
16069 if (encode_wstr_utf8(config->filesystem_errors,
16070 &errors,
16071 "filesystem_errors") < 0) {
16072 PyMem_RawFree(encoding);
16073 return -1;
16074 }
16075
Victor Stinner3d17c042020-05-14 01:48:38 +020016076 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16077 PyMem_RawFree(fs_codec->encoding);
16078 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016079 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016080 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16081 PyMem_RawFree(fs_codec->errors);
16082 fs_codec->errors = errors;
16083 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016084
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016085#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016086 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016087#endif
16088
Victor Stinner709d23d2019-05-02 14:56:30 -040016089 /* At this point, PyUnicode_EncodeFSDefault() and
16090 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16091 the C implementation of the filesystem encoding. */
16092
16093 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16094 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016095 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16096 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016097 PyErr_NoMemory();
16098 return -1;
16099 }
16100 return 0;
16101}
16102
16103
Victor Stinner331a6a52019-05-27 16:39:22 +020016104static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016105init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016106{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016107 PyInterpreterState *interp = tstate->interp;
16108
Victor Stinner709d23d2019-05-02 14:56:30 -040016109 /* Update the filesystem encoding to the normalized Python codec name.
16110 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16111 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016112 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016113 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016114 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016115 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016116 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016117 }
16118
Victor Stinner709d23d2019-05-02 14:56:30 -040016119 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016120 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016121 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016122 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016123}
16124
16125
Victor Stinner331a6a52019-05-27 16:39:22 +020016126PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016127_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016128{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016129 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016130 if (_PyStatus_EXCEPTION(status)) {
16131 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016132 }
16133
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016134 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016135}
16136
16137
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016138static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016139_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016140{
Victor Stinner3d17c042020-05-14 01:48:38 +020016141 PyMem_RawFree(fs_codec->encoding);
16142 fs_codec->encoding = NULL;
16143 fs_codec->utf8 = 0;
16144 PyMem_RawFree(fs_codec->errors);
16145 fs_codec->errors = NULL;
16146 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016147}
16148
16149
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set when
   the new configuration strings cannot be allocated (the configuration is
   then left unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacement strings before touching the config. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (!new_encoding || !new_errors) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;

    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16175
16176
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016177void
Victor Stinner3d483342019-11-22 12:27:50 +010016178_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016179{
Victor Stinner3d483342019-11-22 12:27:50 +010016180 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016181#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016182 /* Insure++ is a memory analysis tool that aids in discovering
16183 * memory leaks and other memory problems. On Python exit, the
16184 * interned string dictionaries are flagged as being in use at exit
16185 * (which it is). Under normal circumstances, this is fine because
16186 * the memory will be automatically reclaimed by the system. Under
16187 * memory debugging, it's a huge source of useless noise, so we
16188 * trade off slower shutdown for less distraction in the memory
16189 * reports. -baw
16190 */
16191 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016192#endif /* __INSURE__ */
16193
Victor Stinner3d483342019-11-22 12:27:50 +010016194 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016195
Victor Stinner607b1022020-05-05 18:50:30 +020016196#ifdef LATIN1_SINGLETONS
Victor Stinner3d483342019-11-22 12:27:50 +010016197 for (Py_ssize_t i = 0; i < 256; i++) {
16198 Py_CLEAR(unicode_latin1[i]);
16199 }
Victor Stinner607b1022020-05-05 18:50:30 +020016200#endif
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016201 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016202 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016203
Victor Stinner3d17c042020-05-14 01:48:38 +020016204 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016205}
16206
16207
Georg Brandl66c221e2010-10-14 07:04:07 +000016208/* A _string module, to export formatter_parser and formatter_field_name_split
16209 to the string.Formatter class implemented in Python. */
16210
16211static PyMethodDef _string_methods[] = {
16212 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16213 METH_O, PyDoc_STR("split the argument as a field name")},
16214 {"formatter_parser", (PyCFunction) formatter_parser,
16215 METH_O, PyDoc_STR("parse the argument as a format string")},
16216 {NULL, NULL}
16217};
16218
16219static struct PyModuleDef _string_module = {
16220 PyModuleDef_HEAD_INIT,
16221 "_string",
16222 PyDoc_STR("string helper module"),
16223 0,
16224 _string_methods,
16225 NULL,
16226 NULL,
16227 NULL,
16228 NULL
16229};
16230
16231PyMODINIT_FUNC
16232PyInit__string(void)
16233{
16234 return PyModule_Create(&_string_module);
16235}
16236
16237
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016238#ifdef __cplusplus
16239}
16240#endif