blob: 28ec8f10dcb3d29536f70b6e5f69687abb26cbf9 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnera15e2602020-04-08 02:01:56 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner45876a92020-02-12 22:32:34 +010044#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010045#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_initconfig.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010047#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020048#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040049#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010050#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000051#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070052#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000054#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000055#include <windows.h>
56#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000057
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
60/* #define INTERNED_STATS 1 */
61
62
Larry Hastings61272b72014-01-07 12:41:53 -080063/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090064class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080065[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090066/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
67
68/*[python input]
69class Py_UCS4_converter(CConverter):
70 type = 'Py_UCS4'
71 converter = 'convert_uc'
72
73 def converter_init(self):
74 if self.default is not unspecified:
75 self.c_default = ascii(self.default)
76 if len(self.c_default) > 4 or self.c_default[0] != "'":
77 self.c_default = hex(ord(self.default))
78
79[python start generated code]*/
80/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080081
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000082/* --- Globals ------------------------------------------------------------
83
Serhiy Storchaka05997252013-01-26 12:14:02 +020084NOTE: In the interpreter's initialization phase, some globals are currently
85 initialized dynamically as needed. In the process Unicode objects may
86 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* Largest valid Unicode code point (as of Unicode 6.0): U+10FFFF,
   i.e. 1,114,111 in decimal. */
#define MAX_UNICODE 0x10ffff
97
/* Sanity check used by the accessor macros in this file: a plain type
   check in release builds, and a structural consistency check (without
   the content scan) when Py_DEBUG is defined. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200103
/* Raw field accessors.  The underscore-prefixed forms that are plain member
   accesses can also be used as assignment targets. */

/* UTF-8 buffer pointer stored in the compact header. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII strings this is the
   memory that directly follows the PyASCIIObject header, otherwise the
   stored utf8 pointer. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     (PyUnicode_IS_COMPACT_ASCII(op)                        \
          ? ((char*)((PyASCIIObject*)(op) + 1))             \
          : _PyUnicode_UTF8(op)))

/* Stored length of the UTF-8 buffer. */
#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the string length itself for compact
   ASCII strings, otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     (PyUnicode_IS_COMPACT_ASCII(op)                        \
          ? ((PyASCIIObject*)(op))->length                  \
          : _PyUnicode_UTF8_LENGTH(op)))

/* wchar_t representation and its length. */
#define _PyUnicode_WSTR(op)        (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the length, state and hash fields. */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)  (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)   (((PyASCIIObject *)(op))->hash)

/* Checked access to the kind and length fields. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* Data pointer of a non-compact (PyUnicodeObject) string. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200138
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready().
   The argument is sanity-checked first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200145
/* True if the UTF-8 pointer of op is the same address as its data buffer.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the same address as its data buffer.
   The comparison uses the macro argument itself; it previously referred to
   a variable named "unicode", so the macro only compiled (and only looked
   at the right object) when the caller happened to use that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
153
/* True if the Unicode object owns a separately allocated UTF-8 memory
   block: it is not compact ASCII, its utf8 pointer is set, and that
   pointer is not the data buffer itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
160
/* True if the Unicode object owns a separately allocated wstr memory
   block: its wstr pointer is set and either the string is not ready yet
   or the pointer is not the data buffer itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
167
/* Generic helper macro that copies characters while converting their type.
   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and are interpreted as "from_type *"; to is
   interpreted as "to_type *" and designates the buffer receiving the
   converted characters.  The bulk of the input is handled four characters
   per iteration, the remainder one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t n = (_end) - (_iter);                                \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {         \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < (_end); _iter++, _to++) {                        \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200191
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
199
Walter Dörwald16807132007-05-25 13:52:07 +0000200/* This dictionary holds all interned unicode strings. Note that references
201 to strings in this dictionary are *not* counted in the string's ob_refcnt.
202 When the interned string reaches a refcnt of 0 the string deallocation
203 function will delete the reference from this dictionary.
204
205 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000206 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000207*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200208static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212
Serhiy Storchaka678db842013-01-26 12:16:36 +0200213#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200214 do { \
215 if (unicode_empty != NULL) \
216 Py_INCREF(unicode_empty); \
217 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200218 unicode_empty = PyUnicode_New(0, 0); \
219 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200220 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200221 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
222 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200223 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200224 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000225
Serhiy Storchaka678db842013-01-26 12:16:36 +0200226#define _Py_RETURN_UNICODE_EMPTY() \
227 do { \
228 _Py_INCREF_UNICODE_EMPTY(); \
229 return unicode_empty; \
230 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
Victor Stinner59423e32018-11-26 13:40:01 +0100232static inline void
233unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
234 Py_ssize_t start, Py_ssize_t length)
235{
236 assert(0 <= start);
237 assert(kind != PyUnicode_WCHAR_KIND);
238 switch (kind) {
239 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100240 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100241 Py_UCS1 ch = (unsigned char)value;
242 Py_UCS1 *to = (Py_UCS1 *)data + start;
243 memset(to, ch, length);
244 break;
245 }
246 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100247 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100248 Py_UCS2 ch = (Py_UCS2)value;
249 Py_UCS2 *to = (Py_UCS2 *)data + start;
250 const Py_UCS2 *end = to + length;
251 for (; to < end; ++to) *to = ch;
252 break;
253 }
254 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100255 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100256 Py_UCS4 ch = value;
257 Py_UCS4 * to = (Py_UCS4 *)data + start;
258 const Py_UCS4 *end = to + length;
259 for (; to < end; ++to) *to = ch;
260 break;
261 }
262 default: Py_UNREACHABLE();
263 }
264}
265
266
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700268static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200269_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900270static inline void
271_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400272static PyObject *
273unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
274 const char *errors);
275static PyObject *
276unicode_decode_utf8(const char *s, Py_ssize_t size,
277 _Py_error_handler error_handler, const char *errors,
278 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200279
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200280/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200281static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200282
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000283/* Single character Unicode strings in the Latin-1 range are being
284 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200285static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000286
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 for everything
   else. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
317
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200318/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200319static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200320static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100321static int unicode_modifiable(PyObject *unicode);
322
Victor Stinnerfe226c02011-10-03 03:52:20 +0200323
Alexander Belopolsky40018472011-02-26 01:02:56 +0000324static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100325_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200326static PyObject *
327_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
328static PyObject *
329_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
330
331static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000332unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000333 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100334 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000335 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
336
Alexander Belopolsky40018472011-02-26 01:02:56 +0000337static void
338raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300339 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100340 PyObject *unicode,
341 Py_ssize_t startpos, Py_ssize_t endpos,
342 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000343
/* Same idea for line breaks: a 128-entry table indexed by ASCII code,
   1 for a line-break character and 0 for everything else. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
371
INADA Naoki3ae20562017-01-16 20:41:20 +0900372static int convert_uc(PyObject *obj, void *addr);
373
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300374#include "clinic/unicodeobject.c.h"
375
Victor Stinner3d4226a2018-08-29 22:21:32 +0200376_Py_error_handler
377_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200378{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200379 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200380 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200381 }
382 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200383 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200384 }
385 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200386 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200387 }
388 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200389 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200390 }
391 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200392 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200393 }
394 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200395 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200396 }
397 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 }
Victor Stinner50149202015-09-22 00:26:54 +0200400 return _Py_ERROR_OTHER;
401}
402
Victor Stinner709d23d2019-05-02 14:56:30 -0400403
404static _Py_error_handler
405get_error_handler_wide(const wchar_t *errors)
406{
407 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
408 return _Py_ERROR_STRICT;
409 }
410 if (wcscmp(errors, L"surrogateescape") == 0) {
411 return _Py_ERROR_SURROGATEESCAPE;
412 }
413 if (wcscmp(errors, L"replace") == 0) {
414 return _Py_ERROR_REPLACE;
415 }
416 if (wcscmp(errors, L"ignore") == 0) {
417 return _Py_ERROR_IGNORE;
418 }
419 if (wcscmp(errors, L"backslashreplace") == 0) {
420 return _Py_ERROR_BACKSLASHREPLACE;
421 }
422 if (wcscmp(errors, L"surrogatepass") == 0) {
423 return _Py_ERROR_SURROGATEPASS;
424 }
425 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
426 return _Py_ERROR_XMLCHARREFREPLACE;
427 }
428 return _Py_ERROR_OTHER;
429}
430
431
Victor Stinner22eb6892019-06-26 00:51:05 +0200432static inline int
433unicode_check_encoding_errors(const char *encoding, const char *errors)
434{
435 if (encoding == NULL && errors == NULL) {
436 return 0;
437 }
438
Victor Stinner81a7be32020-04-14 15:14:01 +0200439 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200440#ifndef Py_DEBUG
441 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200442 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200443 return 0;
444 }
445#else
446 /* Always check in debug mode */
447#endif
448
449 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
450 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
451 if (!interp->fs_codec.encoding) {
452 return 0;
453 }
454
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200455 /* Disable checks during Python finalization. For example, it allows to
456 call _PyObject_Dump() during finalization for debugging purpose. */
457 if (interp->finalizing) {
458 return 0;
459 }
460
Victor Stinner22eb6892019-06-26 00:51:05 +0200461 if (encoding != NULL) {
462 PyObject *handler = _PyCodec_Lookup(encoding);
463 if (handler == NULL) {
464 return -1;
465 }
466 Py_DECREF(handler);
467 }
468
469 if (errors != NULL) {
470 PyObject *handler = PyCodec_LookupError(errors);
471 if (handler == NULL) {
472 return -1;
473 }
474 Py_DECREF(handler);
475 }
476 return 0;
477}
478
479
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300480/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
481 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000482Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000483PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000484{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000485#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000486 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000487#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000488 /* This is actually an illegal character, so it should
489 not be passed to unichr. */
490 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000491#endif
492}
493
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200494int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100495_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200496{
Victor Stinner68762572019-10-07 18:42:01 +0200497#define CHECK(expr) \
498 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
499
Victor Stinner910337b2011-10-03 03:20:16 +0200500 PyASCIIObject *ascii;
501 unsigned int kind;
502
Victor Stinner68762572019-10-07 18:42:01 +0200503 assert(op != NULL);
504 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200505
506 ascii = (PyASCIIObject *)op;
507 kind = ascii->state.kind;
508
Victor Stinnera3b334d2011-10-03 13:53:37 +0200509 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200510 CHECK(kind == PyUnicode_1BYTE_KIND);
511 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200512 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200513 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200514 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200515 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200516
Victor Stinnera41463c2011-10-04 01:05:08 +0200517 if (ascii->state.compact == 1) {
518 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200519 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200520 || kind == PyUnicode_2BYTE_KIND
521 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200522 CHECK(ascii->state.ascii == 0);
523 CHECK(ascii->state.ready == 1);
524 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100525 }
526 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200527 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
528
529 data = unicode->data.any;
530 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200531 CHECK(ascii->length == 0);
532 CHECK(ascii->hash == -1);
533 CHECK(ascii->state.compact == 0);
534 CHECK(ascii->state.ascii == 0);
535 CHECK(ascii->state.ready == 0);
536 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
537 CHECK(ascii->wstr != NULL);
538 CHECK(data == NULL);
539 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200540 }
541 else {
Victor Stinner68762572019-10-07 18:42:01 +0200542 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200543 || kind == PyUnicode_2BYTE_KIND
544 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200545 CHECK(ascii->state.compact == 0);
546 CHECK(ascii->state.ready == 1);
547 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200548 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200549 CHECK(compact->utf8 == data);
550 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200551 }
552 else
Victor Stinner68762572019-10-07 18:42:01 +0200553 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200554 }
555 }
556 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200557 if (
558#if SIZEOF_WCHAR_T == 2
559 kind == PyUnicode_2BYTE_KIND
560#else
561 kind == PyUnicode_4BYTE_KIND
562#endif
563 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200564 {
Victor Stinner68762572019-10-07 18:42:01 +0200565 CHECK(ascii->wstr == data);
566 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200567 } else
Victor Stinner68762572019-10-07 18:42:01 +0200568 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200569 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200570
571 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200572 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200573 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200574 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200575 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200576
577 /* check that the best kind is used: O(n) operation */
578 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200579 Py_ssize_t i;
580 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300581 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200582 Py_UCS4 ch;
583
584 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200585 for (i=0; i < ascii->length; i++)
586 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200587 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200588 if (ch > maxchar)
589 maxchar = ch;
590 }
591 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100592 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200593 CHECK(maxchar >= 128);
594 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100595 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200596 else
Victor Stinner68762572019-10-07 18:42:01 +0200597 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200598 }
Victor Stinner77faf692011-11-20 18:56:05 +0100599 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200600 CHECK(maxchar >= 0x100);
601 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100602 }
603 else {
Victor Stinner68762572019-10-07 18:42:01 +0200604 CHECK(maxchar >= 0x10000);
605 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100606 }
Victor Stinner68762572019-10-07 18:42:01 +0200607 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200608 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400609 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200610
611#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400612}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200613
Victor Stinner910337b2011-10-03 03:20:16 +0200614
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100615static PyObject*
616unicode_result_wchar(PyObject *unicode)
617{
618#ifndef Py_DEBUG
619 Py_ssize_t len;
620
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100621 len = _PyUnicode_WSTR_LENGTH(unicode);
622 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100623 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200624 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100625 }
626
627 if (len == 1) {
628 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100629 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100630 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
631 Py_DECREF(unicode);
632 return latin1_char;
633 }
634 }
635
636 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200637 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 return NULL;
639 }
640#else
Victor Stinneraa771272012-10-04 02:32:58 +0200641 assert(Py_REFCNT(unicode) == 1);
642
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100643 /* don't make the result ready in debug mode to ensure that the caller
644 makes the string ready before using it */
645 assert(_PyUnicode_CheckConsistency(unicode, 1));
646#endif
647 return unicode;
648}
649
650static PyObject*
651unicode_result_ready(PyObject *unicode)
652{
653 Py_ssize_t length;
654
655 length = PyUnicode_GET_LENGTH(unicode);
656 if (length == 0) {
657 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100658 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200659 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100660 }
661 return unicode_empty;
662 }
663
664 if (length == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300665 const void *data = PyUnicode_DATA(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +0200666 int kind = PyUnicode_KIND(unicode);
667 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100668 if (ch < 256) {
669 PyObject *latin1_char = unicode_latin1[ch];
670 if (latin1_char != NULL) {
671 if (unicode != latin1_char) {
672 Py_INCREF(latin1_char);
673 Py_DECREF(unicode);
674 }
675 return latin1_char;
676 }
677 else {
678 assert(_PyUnicode_CheckConsistency(unicode, 1));
679 Py_INCREF(unicode);
680 unicode_latin1[ch] = unicode;
681 return unicode;
682 }
683 }
684 }
685
686 assert(_PyUnicode_CheckConsistency(unicode, 1));
687 return unicode;
688}
689
690static PyObject*
691unicode_result(PyObject *unicode)
692{
693 assert(_PyUnicode_CHECK(unicode));
694 if (PyUnicode_IS_READY(unicode))
695 return unicode_result_ready(unicode);
696 else
697 return unicode_result_wchar(unicode);
698}
699
Victor Stinnerc4b49542011-12-11 22:44:26 +0100700static PyObject*
701unicode_result_unchanged(PyObject *unicode)
702{
703 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500704 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100705 return NULL;
706 Py_INCREF(unicode);
707 return unicode;
708 }
709 else
710 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100711 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100712}
713
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200714/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
715 ASCII, Latin1, UTF-8, etc. */
716static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200717backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200718 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
719{
Victor Stinnerad771582015-10-09 12:38:53 +0200720 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200721 Py_UCS4 ch;
722 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300723 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200724
725 assert(PyUnicode_IS_READY(unicode));
726 kind = PyUnicode_KIND(unicode);
727 data = PyUnicode_DATA(unicode);
728
729 size = 0;
730 /* determine replacement size */
731 for (i = collstart; i < collend; ++i) {
732 Py_ssize_t incr;
733
734 ch = PyUnicode_READ(kind, data, i);
735 if (ch < 0x100)
736 incr = 2+2;
737 else if (ch < 0x10000)
738 incr = 2+4;
739 else {
740 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200741 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200742 }
743 if (size > PY_SSIZE_T_MAX - incr) {
744 PyErr_SetString(PyExc_OverflowError,
745 "encoded result is too long for a Python string");
746 return NULL;
747 }
748 size += incr;
749 }
750
Victor Stinnerad771582015-10-09 12:38:53 +0200751 str = _PyBytesWriter_Prepare(writer, str, size);
752 if (str == NULL)
753 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200754
755 /* generate replacement */
756 for (i = collstart; i < collend; ++i) {
757 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200758 *str++ = '\\';
759 if (ch >= 0x00010000) {
760 *str++ = 'U';
761 *str++ = Py_hexdigits[(ch>>28)&0xf];
762 *str++ = Py_hexdigits[(ch>>24)&0xf];
763 *str++ = Py_hexdigits[(ch>>20)&0xf];
764 *str++ = Py_hexdigits[(ch>>16)&0xf];
765 *str++ = Py_hexdigits[(ch>>12)&0xf];
766 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200767 }
Victor Stinner797485e2015-10-09 03:17:30 +0200768 else if (ch >= 0x100) {
769 *str++ = 'u';
770 *str++ = Py_hexdigits[(ch>>12)&0xf];
771 *str++ = Py_hexdigits[(ch>>8)&0xf];
772 }
773 else
774 *str++ = 'x';
775 *str++ = Py_hexdigits[(ch>>4)&0xf];
776 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777 }
778 return str;
779}
780
781/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
782 ASCII, Latin1, UTF-8, etc. */
783static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200784xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200785 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
786{
Victor Stinnerad771582015-10-09 12:38:53 +0200787 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200788 Py_UCS4 ch;
789 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300790 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200791
792 assert(PyUnicode_IS_READY(unicode));
793 kind = PyUnicode_KIND(unicode);
794 data = PyUnicode_DATA(unicode);
795
796 size = 0;
797 /* determine replacement size */
798 for (i = collstart; i < collend; ++i) {
799 Py_ssize_t incr;
800
801 ch = PyUnicode_READ(kind, data, i);
802 if (ch < 10)
803 incr = 2+1+1;
804 else if (ch < 100)
805 incr = 2+2+1;
806 else if (ch < 1000)
807 incr = 2+3+1;
808 else if (ch < 10000)
809 incr = 2+4+1;
810 else if (ch < 100000)
811 incr = 2+5+1;
812 else if (ch < 1000000)
813 incr = 2+6+1;
814 else {
815 assert(ch <= MAX_UNICODE);
816 incr = 2+7+1;
817 }
818 if (size > PY_SSIZE_T_MAX - incr) {
819 PyErr_SetString(PyExc_OverflowError,
820 "encoded result is too long for a Python string");
821 return NULL;
822 }
823 size += incr;
824 }
825
Victor Stinnerad771582015-10-09 12:38:53 +0200826 str = _PyBytesWriter_Prepare(writer, str, size);
827 if (str == NULL)
828 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200829
830 /* generate replacement */
831 for (i = collstart; i < collend; ++i) {
832 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
833 }
834 return str;
835}
836
Thomas Wouters477c8d52006-05-27 19:21:47 +0000837/* --- Bloom Filters ----------------------------------------------------- */
838
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
842
843/* the linebreak mask is set up by Unicode_Init below */
844
Antoine Pitrouf068f942010-01-13 14:19:12 +0000845#if LONG_BIT >= 128
846#define BLOOM_WIDTH 128
847#elif LONG_BIT >= 64
848#define BLOOM_WIDTH 64
849#elif LONG_BIT >= 32
850#define BLOOM_WIDTH 32
851#else
852#error "LONG_BIT is smaller than 32"
853#endif
854
Thomas Wouters477c8d52006-05-27 19:21:47 +0000855#define BLOOM_MASK unsigned long
856
Serhiy Storchaka05997252013-01-26 12:14:02 +0200857static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000858
Antoine Pitrouf068f942010-01-13 14:19:12 +0000859#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000860
Benjamin Peterson29060642009-01-31 22:14:21 +0000861#define BLOOM_LINEBREAK(ch) \
862 ((ch) < 128U ? ascii_linebreak[(ch)] : \
863 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000864
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700865static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300866make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000867{
Victor Stinnera85af502013-04-09 21:53:54 +0200868#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
869 do { \
870 TYPE *data = (TYPE *)PTR; \
871 TYPE *end = data + LEN; \
872 Py_UCS4 ch; \
873 for (; data != end; data++) { \
874 ch = *data; \
875 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
876 } \
877 break; \
878 } while (0)
879
Thomas Wouters477c8d52006-05-27 19:21:47 +0000880 /* calculate simple bloom-style bitmask for a given unicode string */
881
Antoine Pitrouf068f942010-01-13 14:19:12 +0000882 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000883
884 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200885 switch (kind) {
886 case PyUnicode_1BYTE_KIND:
887 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
888 break;
889 case PyUnicode_2BYTE_KIND:
890 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
891 break;
892 case PyUnicode_4BYTE_KIND:
893 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
894 break;
895 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700896 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200897 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000898 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200899
900#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000901}
902
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300903static int
904ensure_unicode(PyObject *obj)
905{
906 if (!PyUnicode_Check(obj)) {
907 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200908 "must be str, not %.100s",
909 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300910 return -1;
911 }
912 return PyUnicode_READY(obj);
913}
914
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200915/* Compilation of templated routines */
916
917#include "stringlib/asciilib.h"
918#include "stringlib/fastsearch.h"
919#include "stringlib/partition.h"
920#include "stringlib/split.h"
921#include "stringlib/count.h"
922#include "stringlib/find.h"
923#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200924#include "stringlib/undef.h"
925
926#include "stringlib/ucs1lib.h"
927#include "stringlib/fastsearch.h"
928#include "stringlib/partition.h"
929#include "stringlib/split.h"
930#include "stringlib/count.h"
931#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300932#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200933#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200934#include "stringlib/undef.h"
935
936#include "stringlib/ucs2lib.h"
937#include "stringlib/fastsearch.h"
938#include "stringlib/partition.h"
939#include "stringlib/split.h"
940#include "stringlib/count.h"
941#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300942#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200943#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200944#include "stringlib/undef.h"
945
946#include "stringlib/ucs4lib.h"
947#include "stringlib/fastsearch.h"
948#include "stringlib/partition.h"
949#include "stringlib/split.h"
950#include "stringlib/count.h"
951#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300952#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200953#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200954#include "stringlib/undef.h"
955
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200956#include "stringlib/unicodedefs.h"
957#include "stringlib/fastsearch.h"
958#include "stringlib/count.h"
959#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100960#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200961
Guido van Rossumd57fd912000-03-10 22:53:23 +0000962/* --- Unicode Object ----------------------------------------------------- */
963
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700964static inline Py_ssize_t
965findchar(const void *s, int kind,
966 Py_ssize_t size, Py_UCS4 ch,
967 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200969 switch (kind) {
970 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200971 if ((Py_UCS1) ch != ch)
972 return -1;
973 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600974 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200975 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600976 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200977 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200978 if ((Py_UCS2) ch != ch)
979 return -1;
980 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600981 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200982 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600983 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200984 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200985 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600986 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200987 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600988 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200989 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700990 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992}
993
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters added by a resize (indexes
   old_length .. length-1) with 0xFF bytes so that use of uninitialized
   data is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the number of bytes per character */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1012
/* Resize a compact string to `length` characters by reallocating the
   object itself.

   Returns the resized object, which is the pointer returned by
   PyObject_REALLOC() and may therefore differ from `unicode`, or NULL with
   MemoryError set.  The caller must use the returned pointer from then on.
   The asserts require a modifiable, ready, compact string. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* the kind value is used as the number of bytes per character */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* sampled before the realloc; used afterwards to re-point wstr */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* guard the computation of new_size below against overflow */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* header + `length` characters + one terminating null character */
    new_size = (struct_size + (length + 1) * char_size);

    /* drop the UTF-8 buffer, if _PyUnicode_HAS_UTF8_MEMORY() reports one */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_DEL(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* Reference bookkeeping is undone before the realloc and redone with
       _Py_NewReference() afterwards, on both the success and failure
       paths. */
#ifdef Py_REF_DEBUG
    _Py_RefTotal--;
#endif
#ifdef Py_TRACE_REFS
    _Py_ForgetReference(unicode);
#endif

    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
    if (new_unicode == NULL) {
        /* realloc failed: keep using the original object */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* wstr aliased the character data: point it at the data of the
           reallocated object */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* a separate wstr buffer would no longer match: free it */
        PyObject_DEL(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* write the terminating null character */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
1083
/* Resize a non-compact string to `length` characters without moving the
   object: only its separately allocated buffers are reallocated.

   Returns 0 on success, -1 with MemoryError set on failure.  The asserts
   require a non-compact string with a reference count of 1. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        /* the kind value is used as the number of bytes per character */
        char_size = PyUnicode_KIND(unicode);
        /* sampled before the realloc; used afterwards to re-point the
           wstr/utf8 pointers at the new data buffer */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* guard the computation of new_size below against overflow */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        /* `length` characters + one terminating null character */
        new_size = (length + 1) * char_size;

        /* drop a UTF-8 buffer that is not shared with the data buffer */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* Done unless a separate (non-shared) wstr buffer exists; in that
           case fall through and resize it as well. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    /* `length` wchar_t elements + one terminating null element */
    new_size = sizeof(wchar_t) * (length + 1);
    wstr = _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1162
Victor Stinnerfe226c02011-10-03 03:52:20 +02001163static PyObject*
1164resize_copy(PyObject *unicode, Py_ssize_t length)
1165{
1166 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001167 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001168 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001169
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001170 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001171
1172 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1173 if (copy == NULL)
1174 return NULL;
1175
1176 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001177 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001178 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001179 }
1180 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001181 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001182
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001183 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001184 if (w == NULL)
1185 return NULL;
1186 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1187 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001188 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001189 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001190 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001191 }
1192}
1193
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001195 Ux0000 terminated; some code (e.g. new_identifier)
1196 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197
1198 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001199 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200
1201*/
1202
Alexander Belopolsky40018472011-02-26 01:02:56 +00001203static PyUnicodeObject *
1204_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001206 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208
Thomas Wouters477c8d52006-05-27 19:21:47 +00001209 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210 if (length == 0 && unicode_empty != NULL) {
1211 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001212 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 }
1214
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001215 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001216 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001217 return (PyUnicodeObject *)PyErr_NoMemory();
1218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 if (length < 0) {
1220 PyErr_SetString(PyExc_SystemError,
1221 "Negative size passed to _PyUnicode_New");
1222 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 }
1224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001225 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1226 if (unicode == NULL)
1227 return NULL;
1228 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001229
1230 _PyUnicode_WSTR_LENGTH(unicode) = length;
1231 _PyUnicode_HASH(unicode) = -1;
1232 _PyUnicode_STATE(unicode).interned = 0;
1233 _PyUnicode_STATE(unicode).kind = 0;
1234 _PyUnicode_STATE(unicode).compact = 0;
1235 _PyUnicode_STATE(unicode).ready = 0;
1236 _PyUnicode_STATE(unicode).ascii = 0;
1237 _PyUnicode_DATA_ANY(unicode) = NULL;
1238 _PyUnicode_LENGTH(unicode) = 0;
1239 _PyUnicode_UTF8(unicode) = NULL;
1240 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1243 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001244 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001245 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001246 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248
Jeremy Hyltond8082792003-09-16 19:41:39 +00001249 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001250 * the caller fails before initializing str -- unicode_resize()
1251 * reads str[0], and the Keep-Alive optimization can keep memory
1252 * allocated for str alive across a call to unicode_dealloc(unicode).
1253 * We don't want unicode_resize to read uninitialized memory in
1254 * that case.
1255 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001256 _PyUnicode_WSTR(unicode)[0] = 0;
1257 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001258
Victor Stinner7931d9a2011-11-04 00:22:48 +01001259 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260 return unicode;
1261}
1262
Victor Stinnerf42dc442011-10-02 23:33:16 +02001263static const char*
1264unicode_kind_name(PyObject *unicode)
1265{
Victor Stinner42dfd712011-10-03 14:41:45 +02001266 /* don't check consistency: unicode_kind_name() is called from
1267 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001268 if (!PyUnicode_IS_COMPACT(unicode))
1269 {
1270 if (!PyUnicode_IS_READY(unicode))
1271 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001272 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001273 {
1274 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001275 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001276 return "legacy ascii";
1277 else
1278 return "legacy latin1";
1279 case PyUnicode_2BYTE_KIND:
1280 return "legacy UCS2";
1281 case PyUnicode_4BYTE_KIND:
1282 return "legacy UCS4";
1283 default:
1284 return "<legacy invalid kind>";
1285 }
1286 }
1287 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001288 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001290 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001291 return "ascii";
1292 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001293 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001294 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001295 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001296 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001297 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001298 default:
1299 return "<invalid compact kind>";
1300 }
1301}
1302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001305const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001306 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001307 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308}
1309
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001310const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001311 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 return _PyUnicode_COMPACT_DATA(unicode);
1313}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001314const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001315 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001316 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1318 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1319 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1320 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1321 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1322 return PyUnicode_DATA(unicode);
1323}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001324
1325void
1326_PyUnicode_Dump(PyObject *op)
1327{
1328 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001329 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1330 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001331 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001332
Victor Stinnera849a4b2011-10-03 12:12:11 +02001333 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001334 {
1335 if (ascii->state.ascii)
1336 data = (ascii + 1);
1337 else
1338 data = (compact + 1);
1339 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001340 else
1341 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001342 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1343 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001344
Victor Stinnera849a4b2011-10-03 12:12:11 +02001345 if (ascii->wstr == data)
1346 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001347 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001348
Victor Stinnera3b334d2011-10-03 13:53:37 +02001349 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001350 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001351 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1352 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001353 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001354 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001355 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001356 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001357}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358#endif
1359
1360PyObject *
1361PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1362{
1363 PyObject *obj;
1364 PyCompactUnicodeObject *unicode;
1365 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001366 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001367 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 Py_ssize_t char_size;
1369 Py_ssize_t struct_size;
1370
1371 /* Optimization for empty strings */
1372 if (size == 0 && unicode_empty != NULL) {
1373 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001374 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 }
1376
Victor Stinner9e9d6892011-10-04 01:02:02 +02001377 is_ascii = 0;
1378 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001379 struct_size = sizeof(PyCompactUnicodeObject);
1380 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001381 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 char_size = 1;
1383 is_ascii = 1;
1384 struct_size = sizeof(PyASCIIObject);
1385 }
1386 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001387 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388 char_size = 1;
1389 }
1390 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001391 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 char_size = 2;
1393 if (sizeof(wchar_t) == 2)
1394 is_sharing = 1;
1395 }
1396 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001397 if (maxchar > MAX_UNICODE) {
1398 PyErr_SetString(PyExc_SystemError,
1399 "invalid maximum character passed to PyUnicode_New");
1400 return NULL;
1401 }
Victor Stinner8f825062012-04-27 13:55:39 +02001402 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 char_size = 4;
1404 if (sizeof(wchar_t) == 4)
1405 is_sharing = 1;
1406 }
1407
1408 /* Ensure we won't overflow the size. */
1409 if (size < 0) {
1410 PyErr_SetString(PyExc_SystemError,
1411 "Negative size passed to PyUnicode_New");
1412 return NULL;
1413 }
1414 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1415 return PyErr_NoMemory();
1416
1417 /* Duplicated allocation code from _PyObject_New() instead of a call to
1418 * PyObject_New() so we are able to allocate space for the object and
1419 * it's data buffer.
1420 */
1421 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1422 if (obj == NULL)
1423 return PyErr_NoMemory();
1424 obj = PyObject_INIT(obj, &PyUnicode_Type);
1425 if (obj == NULL)
1426 return NULL;
1427
1428 unicode = (PyCompactUnicodeObject *)obj;
1429 if (is_ascii)
1430 data = ((PyASCIIObject*)obj) + 1;
1431 else
1432 data = unicode + 1;
1433 _PyUnicode_LENGTH(unicode) = size;
1434 _PyUnicode_HASH(unicode) = -1;
1435 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001436 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 _PyUnicode_STATE(unicode).compact = 1;
1438 _PyUnicode_STATE(unicode).ready = 1;
1439 _PyUnicode_STATE(unicode).ascii = is_ascii;
1440 if (is_ascii) {
1441 ((char*)data)[size] = 0;
1442 _PyUnicode_WSTR(unicode) = NULL;
1443 }
Victor Stinner8f825062012-04-27 13:55:39 +02001444 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 ((char*)data)[size] = 0;
1446 _PyUnicode_WSTR(unicode) = NULL;
1447 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001449 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451 else {
1452 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001453 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001454 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001456 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 ((Py_UCS4*)data)[size] = 0;
1458 if (is_sharing) {
1459 _PyUnicode_WSTR_LENGTH(unicode) = size;
1460 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1461 }
1462 else {
1463 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1464 _PyUnicode_WSTR(unicode) = NULL;
1465 }
1466 }
Victor Stinner8f825062012-04-27 13:55:39 +02001467#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001468 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001469#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001470 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 return obj;
1472}
1473
#if SIZEOF_WCHAR_T == 2
/* Expand a 16-bit wchar_t sequence [begin, end) into the UCS4 buffer of
   `unicode`.  A high surrogate directly followed by a low surrogate is
   joined into one code point; every other unit is copied as is.

   `unicode` must be a 4-byte kind string whose length equals the number of
   code points produced (checked with assertions only). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src+1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (!is_pair) {
            /* lone unit (including an unpaired surrogate): copy verbatim */
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1513
Victor Stinnercd9950f2011-10-02 00:34:53 +02001514static int
Victor Stinner488fa492011-12-12 00:01:39 +01001515unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001516{
Victor Stinner488fa492011-12-12 00:01:39 +01001517 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001518 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001519 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001520 return -1;
1521 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001522 return 0;
1523}
1524
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001525static int
1526_copy_characters(PyObject *to, Py_ssize_t to_start,
1527 PyObject *from, Py_ssize_t from_start,
1528 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001530 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001531 const void *from_data;
1532 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533
Victor Stinneree4544c2012-05-09 22:24:08 +02001534 assert(0 <= how_many);
1535 assert(0 <= from_start);
1536 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001537 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001539 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540
Victor Stinnerd3f08822012-05-29 12:57:52 +02001541 assert(PyUnicode_Check(to));
1542 assert(PyUnicode_IS_READY(to));
1543 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1544
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001545 if (how_many == 0)
1546 return 0;
1547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001549 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001551 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552
Victor Stinnerf1852262012-06-16 16:38:26 +02001553#ifdef Py_DEBUG
1554 if (!check_maxchar
1555 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1556 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001557 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001558 Py_UCS4 ch;
1559 Py_ssize_t i;
1560 for (i=0; i < how_many; i++) {
1561 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1562 assert(ch <= to_maxchar);
1563 }
1564 }
1565#endif
1566
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001567 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001568 if (check_maxchar
1569 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1570 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001571 /* Writing Latin-1 characters into an ASCII string requires to
1572 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001573 Py_UCS4 max_char;
1574 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001575 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001576 if (max_char >= 128)
1577 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001578 }
Christian Heimesf051e432016-09-13 20:22:02 +02001579 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001580 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001581 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001582 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001583 else if (from_kind == PyUnicode_1BYTE_KIND
1584 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001585 {
1586 _PyUnicode_CONVERT_BYTES(
1587 Py_UCS1, Py_UCS2,
1588 PyUnicode_1BYTE_DATA(from) + from_start,
1589 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1590 PyUnicode_2BYTE_DATA(to) + to_start
1591 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001592 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001593 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001594 && to_kind == PyUnicode_4BYTE_KIND)
1595 {
1596 _PyUnicode_CONVERT_BYTES(
1597 Py_UCS1, Py_UCS4,
1598 PyUnicode_1BYTE_DATA(from) + from_start,
1599 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1600 PyUnicode_4BYTE_DATA(to) + to_start
1601 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001602 }
1603 else if (from_kind == PyUnicode_2BYTE_KIND
1604 && to_kind == PyUnicode_4BYTE_KIND)
1605 {
1606 _PyUnicode_CONVERT_BYTES(
1607 Py_UCS2, Py_UCS4,
1608 PyUnicode_2BYTE_DATA(from) + from_start,
1609 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1610 PyUnicode_4BYTE_DATA(to) + to_start
1611 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001612 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001613 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001614 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1615
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001616 if (!check_maxchar) {
1617 if (from_kind == PyUnicode_2BYTE_KIND
1618 && to_kind == PyUnicode_1BYTE_KIND)
1619 {
1620 _PyUnicode_CONVERT_BYTES(
1621 Py_UCS2, Py_UCS1,
1622 PyUnicode_2BYTE_DATA(from) + from_start,
1623 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1624 PyUnicode_1BYTE_DATA(to) + to_start
1625 );
1626 }
1627 else if (from_kind == PyUnicode_4BYTE_KIND
1628 && to_kind == PyUnicode_1BYTE_KIND)
1629 {
1630 _PyUnicode_CONVERT_BYTES(
1631 Py_UCS4, Py_UCS1,
1632 PyUnicode_4BYTE_DATA(from) + from_start,
1633 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1634 PyUnicode_1BYTE_DATA(to) + to_start
1635 );
1636 }
1637 else if (from_kind == PyUnicode_4BYTE_KIND
1638 && to_kind == PyUnicode_2BYTE_KIND)
1639 {
1640 _PyUnicode_CONVERT_BYTES(
1641 Py_UCS4, Py_UCS2,
1642 PyUnicode_4BYTE_DATA(from) + from_start,
1643 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1644 PyUnicode_2BYTE_DATA(to) + to_start
1645 );
1646 }
1647 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001648 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001649 }
1650 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001651 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001652 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001653 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001654 Py_ssize_t i;
1655
Victor Stinnera0702ab2011-09-29 14:14:38 +02001656 for (i=0; i < how_many; i++) {
1657 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001658 if (ch > to_maxchar)
1659 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001660 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1661 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001662 }
1663 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001664 return 0;
1665}
1666
Victor Stinnerd3f08822012-05-29 12:57:52 +02001667void
1668_PyUnicode_FastCopyCharacters(
1669 PyObject *to, Py_ssize_t to_start,
1670 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001671{
1672 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1673}
1674
1675Py_ssize_t
1676PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1677 PyObject *from, Py_ssize_t from_start,
1678 Py_ssize_t how_many)
1679{
1680 int err;
1681
1682 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1683 PyErr_BadInternalCall();
1684 return -1;
1685 }
1686
Benjamin Petersonbac79492012-01-14 13:34:47 -05001687 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001688 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001689 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001690 return -1;
1691
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001692 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001693 PyErr_SetString(PyExc_IndexError, "string index out of range");
1694 return -1;
1695 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001696 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001697 PyErr_SetString(PyExc_IndexError, "string index out of range");
1698 return -1;
1699 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001700 if (how_many < 0) {
1701 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1702 return -1;
1703 }
1704 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001705 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1706 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001707 "Cannot write %zi characters at %zi "
1708 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001709 how_many, to_start, PyUnicode_GET_LENGTH(to));
1710 return -1;
1711 }
1712
1713 if (how_many == 0)
1714 return 0;
1715
Victor Stinner488fa492011-12-12 00:01:39 +01001716 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001717 return -1;
1718
1719 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1720 if (err) {
1721 PyErr_Format(PyExc_SystemError,
1722 "Cannot copy %s characters "
1723 "into a string of %s characters",
1724 unicode_kind_name(from),
1725 unicode_kind_name(to));
1726 return -1;
1727 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001728 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729}
1730
Victor Stinner17222162011-09-28 22:15:37 +02001731/* Find the maximum code point and count the number of surrogate pairs so a
1732 correct string length can be computed before converting a string to UCS4.
1733 This function counts single surrogates as a character and not as a pair.
1734
1735 Return 0 on success, or -1 on error. */
1736static int
1737find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1738 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739{
1740 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001741 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742
Victor Stinnerc53be962011-10-02 21:33:54 +02001743 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 *num_surrogates = 0;
1745 *maxchar = 0;
1746
1747 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001749 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1750 && (iter+1) < end
1751 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1752 {
1753 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1754 ++(*num_surrogates);
1755 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 }
1757 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001759 {
1760 ch = *iter;
1761 iter++;
1762 }
1763 if (ch > *maxchar) {
1764 *maxchar = ch;
1765 if (*maxchar > MAX_UNICODE) {
1766 PyErr_Format(PyExc_ValueError,
1767 "character U+%x is not in range [U+0000; U+10ffff]",
1768 ch);
1769 return -1;
1770 }
1771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 }
1773 return 0;
1774}
1775
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001776int
1777_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778{
1779 wchar_t *end;
1780 Py_UCS4 maxchar = 0;
1781 Py_ssize_t num_surrogates;
1782#if SIZEOF_WCHAR_T == 2
1783 Py_ssize_t length_wo_surrogates;
1784#endif
1785
Georg Brandl7597add2011-10-05 16:36:47 +02001786 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001787 strings were created using _PyObject_New() and where no canonical
1788 representation (the str field) has been set yet aka strings
1789 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001790 assert(_PyUnicode_CHECK(unicode));
1791 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001793 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001794 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001795 /* Actually, it should neither be interned nor be anything else: */
1796 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001799 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001800 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802
1803 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001804 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1805 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001806 PyErr_NoMemory();
1807 return -1;
1808 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001809 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 _PyUnicode_WSTR(unicode), end,
1811 PyUnicode_1BYTE_DATA(unicode));
1812 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1813 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1814 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1815 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001816 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001817 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001818 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 }
1820 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001821 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001822 _PyUnicode_UTF8(unicode) = NULL;
1823 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 }
1825 PyObject_FREE(_PyUnicode_WSTR(unicode));
1826 _PyUnicode_WSTR(unicode) = NULL;
1827 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1828 }
1829 /* In this case we might have to convert down from 4-byte native
1830 wchar_t to 2-byte unicode. */
1831 else if (maxchar < 65536) {
1832 assert(num_surrogates == 0 &&
1833 "FindMaxCharAndNumSurrogatePairs() messed up");
1834
Victor Stinner506f5922011-09-28 22:34:18 +02001835#if SIZEOF_WCHAR_T == 2
1836 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001838 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1839 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1840 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001841 _PyUnicode_UTF8(unicode) = NULL;
1842 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001843#else
1844 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001845 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001846 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001847 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001848 PyErr_NoMemory();
1849 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850 }
Victor Stinner506f5922011-09-28 22:34:18 +02001851 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1852 _PyUnicode_WSTR(unicode), end,
1853 PyUnicode_2BYTE_DATA(unicode));
1854 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1855 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1856 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001857 _PyUnicode_UTF8(unicode) = NULL;
1858 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001859 PyObject_FREE(_PyUnicode_WSTR(unicode));
1860 _PyUnicode_WSTR(unicode) = NULL;
1861 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1862#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001863 }
1864 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1865 else {
1866#if SIZEOF_WCHAR_T == 2
1867 /* in case the native representation is 2-bytes, we need to allocate a
1868 new normalized 4-byte version. */
1869 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001870 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1871 PyErr_NoMemory();
1872 return -1;
1873 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001874 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1875 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 PyErr_NoMemory();
1877 return -1;
1878 }
1879 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1880 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001881 _PyUnicode_UTF8(unicode) = NULL;
1882 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001883 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1884 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001885 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 PyObject_FREE(_PyUnicode_WSTR(unicode));
1887 _PyUnicode_WSTR(unicode) = NULL;
1888 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1889#else
1890 assert(num_surrogates == 0);
1891
Victor Stinnerc3c74152011-10-02 20:39:55 +02001892 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001893 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001894 _PyUnicode_UTF8(unicode) = NULL;
1895 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1897#endif
1898 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1899 }
1900 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001901 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 return 0;
1903}
1904
Alexander Belopolsky40018472011-02-26 01:02:56 +00001905static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001906unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907{
Walter Dörwald16807132007-05-25 13:52:07 +00001908 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001909 case SSTATE_NOT_INTERNED:
1910 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001911
Benjamin Peterson29060642009-01-31 22:14:21 +00001912 case SSTATE_INTERNED_MORTAL:
1913 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001914 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001915 if (PyDict_DelItem(interned, unicode) != 0) {
1916 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1917 NULL);
1918 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001919 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001920
Benjamin Peterson29060642009-01-31 22:14:21 +00001921 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001922 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1923 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001924
Benjamin Peterson29060642009-01-31 22:14:21 +00001925 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001926 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001927 }
1928
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001929 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001931 }
1932 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001933 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001934 }
1935 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001936 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001937 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001938
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001939 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940}
1941
#ifdef Py_DEBUG
/* Debug-only predicate: return 1 if `unicode` is the cached empty string
   or the cached one-character string stored in unicode_latin1[], and 0
   otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that is not of the wchar kind can be a
       cached Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1958
Alexander Belopolsky40018472011-02-26 01:02:56 +00001959static int
Victor Stinner488fa492011-12-12 00:01:39 +01001960unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001961{
Victor Stinner488fa492011-12-12 00:01:39 +01001962 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001963 if (Py_REFCNT(unicode) != 1)
1964 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001965 if (_PyUnicode_HASH(unicode) != -1)
1966 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001967 if (PyUnicode_CHECK_INTERNED(unicode))
1968 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001969 if (!PyUnicode_CheckExact(unicode))
1970 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001971#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001972 /* singleton refcount is greater than 1 */
1973 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001974#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001975 return 1;
1976}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001977
Victor Stinnerfe226c02011-10-03 03:52:20 +02001978static int
1979unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1980{
1981 PyObject *unicode;
1982 Py_ssize_t old_length;
1983
1984 assert(p_unicode != NULL);
1985 unicode = *p_unicode;
1986
1987 assert(unicode != NULL);
1988 assert(PyUnicode_Check(unicode));
1989 assert(0 <= length);
1990
Victor Stinner910337b2011-10-03 03:20:16 +02001991 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001992 old_length = PyUnicode_WSTR_LENGTH(unicode);
1993 else
1994 old_length = PyUnicode_GET_LENGTH(unicode);
1995 if (old_length == length)
1996 return 0;
1997
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001998 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001999 _Py_INCREF_UNICODE_EMPTY();
2000 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00002001 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002002 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002003 return 0;
2004 }
2005
Victor Stinner488fa492011-12-12 00:01:39 +01002006 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002007 PyObject *copy = resize_copy(unicode, length);
2008 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002009 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002010 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002011 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002012 }
2013
Victor Stinnerfe226c02011-10-03 03:52:20 +02002014 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002015 PyObject *new_unicode = resize_compact(unicode, length);
2016 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002017 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002018 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002019 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002020 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002021 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002022}
2023
Alexander Belopolsky40018472011-02-26 01:02:56 +00002024int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002025PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002026{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002027 PyObject *unicode;
2028 if (p_unicode == NULL) {
2029 PyErr_BadInternalCall();
2030 return -1;
2031 }
2032 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002033 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002034 {
2035 PyErr_BadInternalCall();
2036 return -1;
2037 }
2038 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002039}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002040
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002041/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002042
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002043 WARNING: The function doesn't copy the terminating null character and
2044 doesn't check the maximum character (may write a latin1 character in an
2045 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002046static void
2047unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2048 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002049{
2050 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002051 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002052 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002053
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002054 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002055 switch (kind) {
2056 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002057#ifdef Py_DEBUG
2058 if (PyUnicode_IS_ASCII(unicode)) {
2059 Py_UCS4 maxchar = ucs1lib_find_max_char(
2060 (const Py_UCS1*)str,
2061 (const Py_UCS1*)str + len);
2062 assert(maxchar < 128);
2063 }
2064#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002065 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002066 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002067 }
2068 case PyUnicode_2BYTE_KIND: {
2069 Py_UCS2 *start = (Py_UCS2 *)data + index;
2070 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002071
Victor Stinner184252a2012-06-16 02:57:41 +02002072 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002073 *ucs2 = (Py_UCS2)*str;
2074
2075 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002076 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002077 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002078 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002079 Py_UCS4 *start = (Py_UCS4 *)data + index;
2080 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002081
Victor Stinner184252a2012-06-16 02:57:41 +02002082 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002083 *ucs4 = (Py_UCS4)*str;
2084
2085 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002086 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002087 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002088 default:
2089 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002090 }
2091}
2092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093static PyObject*
2094get_latin1_char(unsigned char ch)
2095{
Victor Stinnera464fc12011-10-02 20:39:30 +02002096 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002097 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002098 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099 if (!unicode)
2100 return NULL;
2101 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002102 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002103 unicode_latin1[ch] = unicode;
2104 }
2105 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002106 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002107}
2108
Victor Stinner985a82a2014-01-03 12:53:47 +01002109static PyObject*
2110unicode_char(Py_UCS4 ch)
2111{
2112 PyObject *unicode;
2113
2114 assert(ch <= MAX_UNICODE);
2115
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002116 if (ch < 256)
2117 return get_latin1_char(ch);
2118
Victor Stinner985a82a2014-01-03 12:53:47 +01002119 unicode = PyUnicode_New(1, ch);
2120 if (unicode == NULL)
2121 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002122
2123 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2124 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002125 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002126 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002127 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2128 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2129 }
2130 assert(_PyUnicode_CheckConsistency(unicode, 1));
2131 return unicode;
2132}
2133
Alexander Belopolsky40018472011-02-26 01:02:56 +00002134PyObject *
2135PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002137 if (u == NULL)
2138 return (PyObject*)_PyUnicode_New(size);
2139
2140 if (size < 0) {
2141 PyErr_BadInternalCall();
2142 return NULL;
2143 }
2144
2145 return PyUnicode_FromWideChar(u, size);
2146}
2147
2148PyObject *
2149PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2150{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002151 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002152 Py_UCS4 maxchar = 0;
2153 Py_ssize_t num_surrogates;
2154
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002155 if (u == NULL && size != 0) {
2156 PyErr_BadInternalCall();
2157 return NULL;
2158 }
2159
2160 if (size == -1) {
2161 size = wcslen(u);
2162 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002164 /* If the Unicode data is known at construction time, we can apply
2165 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002168 if (size == 0)
2169 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 /* Single character Unicode objects in the Latin-1 range are
2172 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002173 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 return get_latin1_char((unsigned char)*u);
2175
2176 /* If not empty and not single character, copy the Unicode data
2177 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002178 if (find_maxchar_surrogates(u, u + size,
2179 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 return NULL;
2181
Victor Stinner8faf8212011-12-08 22:14:11 +01002182 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 if (!unicode)
2184 return NULL;
2185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 switch (PyUnicode_KIND(unicode)) {
2187 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002188 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2190 break;
2191 case PyUnicode_2BYTE_KIND:
2192#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002193 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002195 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2197#endif
2198 break;
2199 case PyUnicode_4BYTE_KIND:
2200#if SIZEOF_WCHAR_T == 2
2201 /* This is the only case which has to process surrogates, thus
2202 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002203 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204#else
2205 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002206 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207#endif
2208 break;
2209 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002210 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002213 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214}
2215
Alexander Belopolsky40018472011-02-26 01:02:56 +00002216PyObject *
2217PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002218{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002219 if (size < 0) {
2220 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002221 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002222 return NULL;
2223 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002224 if (u != NULL)
2225 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2226 else
2227 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002228}
2229
Alexander Belopolsky40018472011-02-26 01:02:56 +00002230PyObject *
2231PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002232{
2233 size_t size = strlen(u);
2234 if (size > PY_SSIZE_T_MAX) {
2235 PyErr_SetString(PyExc_OverflowError, "input too long");
2236 return NULL;
2237 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002238 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002239}
2240
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002241PyObject *
2242_PyUnicode_FromId(_Py_Identifier *id)
2243{
2244 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002245 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2246 strlen(id->string),
2247 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002248 if (!id->object)
2249 return NULL;
2250 PyUnicode_InternInPlace(&id->object);
2251 assert(!id->next);
2252 id->next = static_strings;
2253 static_strings = id;
2254 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002255 return id->object;
2256}
2257
2258void
2259_PyUnicode_ClearStaticStrings()
2260{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002261 _Py_Identifier *tmp, *s = static_strings;
2262 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002263 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002264 tmp = s->next;
2265 s->next = NULL;
2266 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002267 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002268 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002269}
2270
Benjamin Peterson0df54292012-03-26 14:50:32 -04002271/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002272
Victor Stinnerd3f08822012-05-29 12:57:52 +02002273PyObject*
2274_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002275{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002276 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002277 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002278 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002279#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002280 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002281#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002282 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002283 }
Victor Stinner785938e2011-12-11 20:09:03 +01002284 unicode = PyUnicode_New(size, 127);
2285 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002286 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002287 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2288 assert(_PyUnicode_CheckConsistency(unicode, 1));
2289 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002290}
2291
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002292static Py_UCS4
2293kind_maxchar_limit(unsigned int kind)
2294{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002295 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002296 case PyUnicode_1BYTE_KIND:
2297 return 0x80;
2298 case PyUnicode_2BYTE_KIND:
2299 return 0x100;
2300 case PyUnicode_4BYTE_KIND:
2301 return 0x10000;
2302 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002303 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002304 }
2305}
2306
Victor Stinner702c7342011-10-05 13:50:52 +02002307static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002308_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002311 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002312
Serhiy Storchaka678db842013-01-26 12:16:36 +02002313 if (size == 0)
2314 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002315 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002316 if (size == 1)
2317 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002318
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002319 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002320 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002321 if (!res)
2322 return NULL;
2323 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002324 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002325 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002326}
2327
Victor Stinnere57b1c02011-09-28 22:20:48 +02002328static PyObject*
2329_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002330{
2331 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002332 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002333
Serhiy Storchaka678db842013-01-26 12:16:36 +02002334 if (size == 0)
2335 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002336 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002337 if (size == 1)
2338 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002339
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002340 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002341 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 if (!res)
2343 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002344 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002346 else {
2347 _PyUnicode_CONVERT_BYTES(
2348 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2349 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002350 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 return res;
2352}
2353
Victor Stinnere57b1c02011-09-28 22:20:48 +02002354static PyObject*
2355_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356{
2357 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002358 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002359
Serhiy Storchaka678db842013-01-26 12:16:36 +02002360 if (size == 0)
2361 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002362 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002363 if (size == 1)
2364 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002365
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002366 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002367 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002368 if (!res)
2369 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002370 if (max_char < 256)
2371 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2372 PyUnicode_1BYTE_DATA(res));
2373 else if (max_char < 0x10000)
2374 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2375 PyUnicode_2BYTE_DATA(res));
2376 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002377 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002378 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 return res;
2380}
2381
2382PyObject*
2383PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2384{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002385 if (size < 0) {
2386 PyErr_SetString(PyExc_ValueError, "size must be positive");
2387 return NULL;
2388 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002389 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002391 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002393 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002395 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002396 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002397 PyErr_SetString(PyExc_SystemError, "invalid kind");
2398 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400}
2401
Victor Stinnerece58de2012-04-23 23:36:38 +02002402Py_UCS4
2403_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2404{
2405 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002406 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002407
2408 assert(PyUnicode_IS_READY(unicode));
2409 assert(0 <= start);
2410 assert(end <= PyUnicode_GET_LENGTH(unicode));
2411 assert(start <= end);
2412
2413 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2414 return PyUnicode_MAX_CHAR_VALUE(unicode);
2415
2416 if (start == end)
2417 return 127;
2418
Victor Stinner94d558b2012-04-27 22:26:58 +02002419 if (PyUnicode_IS_ASCII(unicode))
2420 return 127;
2421
Victor Stinnerece58de2012-04-23 23:36:38 +02002422 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002423 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002424 endptr = (char *)startptr + end * kind;
2425 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002426 switch(kind) {
2427 case PyUnicode_1BYTE_KIND:
2428 return ucs1lib_find_max_char(startptr, endptr);
2429 case PyUnicode_2BYTE_KIND:
2430 return ucs2lib_find_max_char(startptr, endptr);
2431 case PyUnicode_4BYTE_KIND:
2432 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002433 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002434 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002435 }
2436}
2437
Victor Stinner25a4b292011-10-06 12:31:55 +02002438/* Ensure that a string uses the most efficient storage, if it is not the
2439 case: create a new string with of the right kind. Write NULL into *p_unicode
2440 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002441static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002442unicode_adjust_maxchar(PyObject **p_unicode)
2443{
2444 PyObject *unicode, *copy;
2445 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002446 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002447 unsigned int kind;
2448
2449 assert(p_unicode != NULL);
2450 unicode = *p_unicode;
2451 assert(PyUnicode_IS_READY(unicode));
2452 if (PyUnicode_IS_ASCII(unicode))
2453 return;
2454
2455 len = PyUnicode_GET_LENGTH(unicode);
2456 kind = PyUnicode_KIND(unicode);
2457 if (kind == PyUnicode_1BYTE_KIND) {
2458 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002459 max_char = ucs1lib_find_max_char(u, u + len);
2460 if (max_char >= 128)
2461 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002462 }
2463 else if (kind == PyUnicode_2BYTE_KIND) {
2464 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002465 max_char = ucs2lib_find_max_char(u, u + len);
2466 if (max_char >= 256)
2467 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002468 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002469 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002470 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002471 max_char = ucs4lib_find_max_char(u, u + len);
2472 if (max_char >= 0x10000)
2473 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002474 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002475 else
2476 Py_UNREACHABLE();
2477
Victor Stinner25a4b292011-10-06 12:31:55 +02002478 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002479 if (copy != NULL)
2480 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002481 Py_DECREF(unicode);
2482 *p_unicode = copy;
2483}
2484
Victor Stinner034f6cf2011-09-30 02:26:44 +02002485PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002486_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002487{
Victor Stinner87af4f22011-11-21 23:03:47 +01002488 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002489 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002490
Victor Stinner034f6cf2011-09-30 02:26:44 +02002491 if (!PyUnicode_Check(unicode)) {
2492 PyErr_BadInternalCall();
2493 return NULL;
2494 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002495 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002496 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002497
Victor Stinner87af4f22011-11-21 23:03:47 +01002498 length = PyUnicode_GET_LENGTH(unicode);
2499 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002500 if (!copy)
2501 return NULL;
2502 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2503
Christian Heimesf051e432016-09-13 20:22:02 +02002504 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002505 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002506 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002507 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002508}
2509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002510
Victor Stinnerbc603d12011-10-02 01:00:40 +02002511/* Widen Unicode objects to larger buffers. Don't write terminating null
2512 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002513
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002514static void*
2515unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002517 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002518
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002519 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002520 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002521 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002522 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002523 if (!result)
2524 return PyErr_NoMemory();
2525 assert(skind == PyUnicode_1BYTE_KIND);
2526 _PyUnicode_CONVERT_BYTES(
2527 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002528 (const Py_UCS1 *)data,
2529 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002530 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002532 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002533 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002534 if (!result)
2535 return PyErr_NoMemory();
2536 if (skind == PyUnicode_2BYTE_KIND) {
2537 _PyUnicode_CONVERT_BYTES(
2538 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002539 (const Py_UCS2 *)data,
2540 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002541 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002542 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002543 else {
2544 assert(skind == PyUnicode_1BYTE_KIND);
2545 _PyUnicode_CONVERT_BYTES(
2546 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002547 (const Py_UCS1 *)data,
2548 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002549 result);
2550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002552 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002553 Py_UNREACHABLE();
2554 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002555 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556}
2557
2558static Py_UCS4*
2559as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2560 int copy_null)
2561{
2562 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002563 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002564 Py_ssize_t len, targetlen;
2565 if (PyUnicode_READY(string) == -1)
2566 return NULL;
2567 kind = PyUnicode_KIND(string);
2568 data = PyUnicode_DATA(string);
2569 len = PyUnicode_GET_LENGTH(string);
2570 targetlen = len;
2571 if (copy_null)
2572 targetlen++;
2573 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002574 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 if (!target) {
2576 PyErr_NoMemory();
2577 return NULL;
2578 }
2579 }
2580 else {
2581 if (targetsize < targetlen) {
2582 PyErr_Format(PyExc_SystemError,
2583 "string is longer than the buffer");
2584 if (copy_null && 0 < targetsize)
2585 target[0] = 0;
2586 return NULL;
2587 }
2588 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002589 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002590 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002591 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002593 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002594 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002595 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2596 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002597 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002598 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002599 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002600 else {
2601 Py_UNREACHABLE();
2602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 if (copy_null)
2604 target[len] = 0;
2605 return target;
2606}
2607
2608Py_UCS4*
2609PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2610 int copy_null)
2611{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002612 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 PyErr_BadInternalCall();
2614 return NULL;
2615 }
2616 return as_ucs4(string, target, targetsize, copy_null);
2617}
2618
2619Py_UCS4*
2620PyUnicode_AsUCS4Copy(PyObject *string)
2621{
2622 return as_ucs4(string, NULL, 0, 1);
2623}
2624
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign.  53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG) * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002629
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002630static int
2631unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2632 Py_ssize_t width, Py_ssize_t precision)
2633{
2634 Py_ssize_t length, fill, arglen;
2635 Py_UCS4 maxchar;
2636
2637 if (PyUnicode_READY(str) == -1)
2638 return -1;
2639
2640 length = PyUnicode_GET_LENGTH(str);
2641 if ((precision == -1 || precision >= length)
2642 && width <= length)
2643 return _PyUnicodeWriter_WriteStr(writer, str);
2644
2645 if (precision != -1)
2646 length = Py_MIN(precision, length);
2647
2648 arglen = Py_MAX(length, width);
2649 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2650 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2651 else
2652 maxchar = writer->maxchar;
2653
2654 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2655 return -1;
2656
2657 if (width > length) {
2658 fill = width - length;
2659 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2660 return -1;
2661 writer->pos += fill;
2662 }
2663
2664 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2665 str, 0, length);
2666 writer->pos += length;
2667 return 0;
2668}
2669
2670static int
Victor Stinner998b8062018-09-12 00:23:25 +02002671unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002672 Py_ssize_t width, Py_ssize_t precision)
2673{
2674 /* UTF-8 */
2675 Py_ssize_t length;
2676 PyObject *unicode;
2677 int res;
2678
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002679 if (precision == -1) {
2680 length = strlen(str);
2681 }
2682 else {
2683 length = 0;
2684 while (length < precision && str[length]) {
2685 length++;
2686 }
2687 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002688 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2689 if (unicode == NULL)
2690 return -1;
2691
2692 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2693 Py_DECREF(unicode);
2694 return res;
2695}
2696
Victor Stinner96865452011-03-01 23:44:09 +00002697static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002698unicode_fromformat_arg(_PyUnicodeWriter *writer,
2699 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002700{
Victor Stinnere215d962012-10-06 23:03:36 +02002701 const char *p;
2702 Py_ssize_t len;
2703 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002704 Py_ssize_t width;
2705 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002706 int longflag;
2707 int longlongflag;
2708 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002709 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002710
2711 p = f;
2712 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002713 zeropad = 0;
2714 if (*f == '0') {
2715 zeropad = 1;
2716 f++;
2717 }
Victor Stinner96865452011-03-01 23:44:09 +00002718
2719 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002720 width = -1;
2721 if (Py_ISDIGIT((unsigned)*f)) {
2722 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002723 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002724 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002725 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002726 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002727 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002728 return NULL;
2729 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002731 f++;
2732 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002733 }
2734 precision = -1;
2735 if (*f == '.') {
2736 f++;
2737 if (Py_ISDIGIT((unsigned)*f)) {
2738 precision = (*f - '0');
2739 f++;
2740 while (Py_ISDIGIT((unsigned)*f)) {
2741 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2742 PyErr_SetString(PyExc_ValueError,
2743 "precision too big");
2744 return NULL;
2745 }
2746 precision = (precision * 10) + (*f - '0');
2747 f++;
2748 }
2749 }
Victor Stinner96865452011-03-01 23:44:09 +00002750 if (*f == '%') {
2751 /* "%.3%s" => f points to "3" */
2752 f--;
2753 }
2754 }
2755 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002756 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002757 f--;
2758 }
Victor Stinner96865452011-03-01 23:44:09 +00002759
2760 /* Handle %ld, %lu, %lld and %llu. */
2761 longflag = 0;
2762 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002763 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002764 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002765 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002766 longflag = 1;
2767 ++f;
2768 }
Victor Stinner96865452011-03-01 23:44:09 +00002769 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002770 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002771 longlongflag = 1;
2772 f += 2;
2773 }
Victor Stinner96865452011-03-01 23:44:09 +00002774 }
2775 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002776 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002777 size_tflag = 1;
2778 ++f;
2779 }
Victor Stinnere215d962012-10-06 23:03:36 +02002780
2781 if (f[1] == '\0')
2782 writer->overallocate = 0;
2783
2784 switch (*f) {
2785 case 'c':
2786 {
2787 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002788 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002789 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002790 "character argument not in range(0x110000)");
2791 return NULL;
2792 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002793 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002794 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002795 break;
2796 }
2797
2798 case 'i':
2799 case 'd':
2800 case 'u':
2801 case 'x':
2802 {
2803 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002804 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002806
2807 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002808 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002809 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002810 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002811 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002812 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002813 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002814 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002815 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002816 va_arg(*vargs, size_t));
2817 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002818 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002819 va_arg(*vargs, unsigned int));
2820 }
2821 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002822 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002823 }
2824 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002825 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002826 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002827 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002828 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002829 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002830 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002831 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002832 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002833 va_arg(*vargs, Py_ssize_t));
2834 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002835 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002836 va_arg(*vargs, int));
2837 }
2838 assert(len >= 0);
2839
Victor Stinnere215d962012-10-06 23:03:36 +02002840 if (precision < len)
2841 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002842
2843 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002844 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2845 return NULL;
2846
Victor Stinnere215d962012-10-06 23:03:36 +02002847 if (width > precision) {
2848 Py_UCS4 fillchar;
2849 fill = width - precision;
2850 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002851 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2852 return NULL;
2853 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002854 }
Victor Stinner15a11362012-10-06 23:48:20 +02002855 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002856 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002857 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2858 return NULL;
2859 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002860 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002861
Victor Stinner4a587072013-11-19 12:54:53 +01002862 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2863 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002864 break;
2865 }
2866
2867 case 'p':
2868 {
2869 char number[MAX_LONG_LONG_CHARS];
2870
2871 len = sprintf(number, "%p", va_arg(*vargs, void*));
2872 assert(len >= 0);
2873
2874 /* %p is ill-defined: ensure leading 0x. */
2875 if (number[1] == 'X')
2876 number[1] = 'x';
2877 else if (number[1] != 'x') {
2878 memmove(number + 2, number,
2879 strlen(number) + 1);
2880 number[0] = '0';
2881 number[1] = 'x';
2882 len += 2;
2883 }
2884
Victor Stinner4a587072013-11-19 12:54:53 +01002885 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002886 return NULL;
2887 break;
2888 }
2889
2890 case 's':
2891 {
2892 /* UTF-8 */
2893 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002894 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002895 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002896 break;
2897 }
2898
2899 case 'U':
2900 {
2901 PyObject *obj = va_arg(*vargs, PyObject *);
2902 assert(obj && _PyUnicode_CHECK(obj));
2903
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002904 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002905 return NULL;
2906 break;
2907 }
2908
2909 case 'V':
2910 {
2911 PyObject *obj = va_arg(*vargs, PyObject *);
2912 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002913 if (obj) {
2914 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002915 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002916 return NULL;
2917 }
2918 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002919 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002920 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002921 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002922 }
2923 break;
2924 }
2925
2926 case 'S':
2927 {
2928 PyObject *obj = va_arg(*vargs, PyObject *);
2929 PyObject *str;
2930 assert(obj);
2931 str = PyObject_Str(obj);
2932 if (!str)
2933 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002934 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002935 Py_DECREF(str);
2936 return NULL;
2937 }
2938 Py_DECREF(str);
2939 break;
2940 }
2941
2942 case 'R':
2943 {
2944 PyObject *obj = va_arg(*vargs, PyObject *);
2945 PyObject *repr;
2946 assert(obj);
2947 repr = PyObject_Repr(obj);
2948 if (!repr)
2949 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002950 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002951 Py_DECREF(repr);
2952 return NULL;
2953 }
2954 Py_DECREF(repr);
2955 break;
2956 }
2957
2958 case 'A':
2959 {
2960 PyObject *obj = va_arg(*vargs, PyObject *);
2961 PyObject *ascii;
2962 assert(obj);
2963 ascii = PyObject_ASCII(obj);
2964 if (!ascii)
2965 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002966 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002967 Py_DECREF(ascii);
2968 return NULL;
2969 }
2970 Py_DECREF(ascii);
2971 break;
2972 }
2973
2974 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002975 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002976 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002977 break;
2978
2979 default:
2980 /* if we stumble upon an unknown formatting code, copy the rest
2981 of the format string to the output string. (we cannot just
2982 skip the code, since there's no way to know what's in the
2983 argument list) */
2984 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002985 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002986 return NULL;
2987 f = p+len;
2988 return f;
2989 }
2990
2991 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002992 return f;
2993}
2994
Walter Dörwaldd2034312007-05-18 16:29:38 +00002995PyObject *
2996PyUnicode_FromFormatV(const char *format, va_list vargs)
2997{
Victor Stinnere215d962012-10-06 23:03:36 +02002998 va_list vargs2;
2999 const char *f;
3000 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003001
Victor Stinner8f674cc2013-04-17 23:02:17 +02003002 _PyUnicodeWriter_Init(&writer);
3003 writer.min_length = strlen(format) + 100;
3004 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003005
Benjamin Peterson0c212142016-09-20 20:39:33 -07003006 // Copy varags to be able to pass a reference to a subfunction.
3007 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003008
3009 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003010 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003011 f = unicode_fromformat_arg(&writer, f, &vargs2);
3012 if (f == NULL)
3013 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003015 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003016 const char *p;
3017 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003018
Victor Stinnere215d962012-10-06 23:03:36 +02003019 p = f;
3020 do
3021 {
3022 if ((unsigned char)*p > 127) {
3023 PyErr_Format(PyExc_ValueError,
3024 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3025 "string, got a non-ASCII byte: 0x%02x",
3026 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003027 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003028 }
3029 p++;
3030 }
3031 while (*p != '\0' && *p != '%');
3032 len = p - f;
3033
3034 if (*p == '\0')
3035 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003036
3037 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003038 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003039
3040 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003041 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003042 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003043 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003044 return _PyUnicodeWriter_Finish(&writer);
3045
3046 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003047 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003048 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003049 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003050}
3051
Walter Dörwaldd2034312007-05-18 16:29:38 +00003052PyObject *
3053PyUnicode_FromFormat(const char *format, ...)
3054{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003055 PyObject* ret;
3056 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003057
3058#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003059 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003060#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003061 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003062#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003063 ret = PyUnicode_FromFormatV(format, vargs);
3064 va_end(vargs);
3065 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003066}
3067
Serhiy Storchakac46db922018-10-23 22:58:24 +03003068static Py_ssize_t
3069unicode_get_widechar_size(PyObject *unicode)
3070{
3071 Py_ssize_t res;
3072
3073 assert(unicode != NULL);
3074 assert(_PyUnicode_CHECK(unicode));
3075
3076 if (_PyUnicode_WSTR(unicode) != NULL) {
3077 return PyUnicode_WSTR_LENGTH(unicode);
3078 }
3079 assert(PyUnicode_IS_READY(unicode));
3080
3081 res = _PyUnicode_LENGTH(unicode);
3082#if SIZEOF_WCHAR_T == 2
3083 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3084 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3085 const Py_UCS4 *end = s + res;
3086 for (; s < end; ++s) {
3087 if (*s > 0xFFFF) {
3088 ++res;
3089 }
3090 }
3091 }
3092#endif
3093 return res;
3094}
3095
3096static void
3097unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3098{
3099 const wchar_t *wstr;
3100
3101 assert(unicode != NULL);
3102 assert(_PyUnicode_CHECK(unicode));
3103
3104 wstr = _PyUnicode_WSTR(unicode);
3105 if (wstr != NULL) {
3106 memcpy(w, wstr, size * sizeof(wchar_t));
3107 return;
3108 }
3109 assert(PyUnicode_IS_READY(unicode));
3110
3111 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3112 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3113 for (; size--; ++s, ++w) {
3114 *w = *s;
3115 }
3116 }
3117 else {
3118#if SIZEOF_WCHAR_T == 4
3119 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3120 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3121 for (; size--; ++s, ++w) {
3122 *w = *s;
3123 }
3124#else
3125 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3126 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3127 for (; size--; ++s, ++w) {
3128 Py_UCS4 ch = *s;
3129 if (ch > 0xFFFF) {
3130 assert(ch <= MAX_UNICODE);
3131 /* encode surrogate pair in this case */
3132 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3133 if (!size--)
3134 break;
3135 *w = Py_UNICODE_LOW_SURROGATE(ch);
3136 }
3137 else {
3138 *w = ch;
3139 }
3140 }
3141#endif
3142 }
3143}
3144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003145#ifdef HAVE_WCHAR_H
3146
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003147/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003148
Victor Stinnerd88d9832011-09-06 02:00:05 +02003149 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003150 character) required to convert the unicode object. Ignore size argument.
3151
Victor Stinnerd88d9832011-09-06 02:00:05 +02003152 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003153 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003154 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003155Py_ssize_t
3156PyUnicode_AsWideChar(PyObject *unicode,
3157 wchar_t *w,
3158 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003159{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003160 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003161
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003162 if (unicode == NULL) {
3163 PyErr_BadInternalCall();
3164 return -1;
3165 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003166 if (!PyUnicode_Check(unicode)) {
3167 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003168 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003169 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003170
3171 res = unicode_get_widechar_size(unicode);
3172 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003173 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003174 }
3175
3176 if (size > res) {
3177 size = res + 1;
3178 }
3179 else {
3180 res = size;
3181 }
3182 unicode_copy_as_widechar(unicode, w, size);
3183 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003184}
3185
Victor Stinner137c34c2010-09-29 10:25:54 +00003186wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003187PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003188 Py_ssize_t *size)
3189{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003190 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003191 Py_ssize_t buflen;
3192
3193 if (unicode == NULL) {
3194 PyErr_BadInternalCall();
3195 return NULL;
3196 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003197 if (!PyUnicode_Check(unicode)) {
3198 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003199 return NULL;
3200 }
3201
Serhiy Storchakac46db922018-10-23 22:58:24 +03003202 buflen = unicode_get_widechar_size(unicode);
3203 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003204 if (buffer == NULL) {
3205 PyErr_NoMemory();
3206 return NULL;
3207 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003208 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3209 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003210 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003211 }
3212 else if (wcslen(buffer) != (size_t)buflen) {
3213 PyMem_FREE(buffer);
3214 PyErr_SetString(PyExc_ValueError,
3215 "embedded null character");
3216 return NULL;
3217 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003218 return buffer;
3219}
3220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003221#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222
Alexander Belopolsky40018472011-02-26 01:02:56 +00003223PyObject *
3224PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003225{
Victor Stinner8faf8212011-12-08 22:14:11 +01003226 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 PyErr_SetString(PyExc_ValueError,
3228 "chr() arg not in range(0x110000)");
3229 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003230 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003231
Victor Stinner985a82a2014-01-03 12:53:47 +01003232 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003233}
3234
Alexander Belopolsky40018472011-02-26 01:02:56 +00003235PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003236PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003238 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003240 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003241 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003242 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 Py_INCREF(obj);
3244 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003245 }
3246 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 /* For a Unicode subtype that's not a Unicode object,
3248 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003249 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003250 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003251 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003252 "Can't convert '%.100s' object to str implicitly",
3253 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003254 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003255}
3256
Alexander Belopolsky40018472011-02-26 01:02:56 +00003257PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003258PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003259 const char *encoding,
3260 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003261{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003262 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003263 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003264
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003266 PyErr_BadInternalCall();
3267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003269
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003270 /* Decoding bytes objects is the most common case and should be fast */
3271 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003272 if (PyBytes_GET_SIZE(obj) == 0) {
3273 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3274 return NULL;
3275 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003276 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003277 }
3278 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003279 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3280 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003281 }
3282
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003283 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003284 PyErr_SetString(PyExc_TypeError,
3285 "decoding str is not supported");
3286 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003287 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003288
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003289 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3290 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3291 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003292 "decoding to str: need a bytes-like object, %.80s found",
3293 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003294 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003295 }
Tim Petersced69f82003-09-16 20:30:58 +00003296
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003297 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003298 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003299 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3300 return NULL;
3301 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003302 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003304
Serhiy Storchaka05997252013-01-26 12:14:02 +02003305 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003306 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003307 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308}
3309
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to lower in lowercase.  Any
   run of other characters located between two copied characters is
   replaced with a single '_'; such characters are dropped when they are at
   the start or at the end of the name.  lower is null-terminated on
   success.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0 so that not even the null byte fits). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* lower_len - 1 below would wrap around and disable every bounds
           check: there is no room for any output. */
        return 0;
    }

    e = encoding;
    l = lower;
    /* last usable position: reserved for the terminating null byte */
    l_end = &lower[lower_len - 1];
    /* set when punctuation was seen since the last copied character */
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* emit the pending separator, except at the very start */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3358
Alexander Belopolsky40018472011-02-26 01:02:56 +00003359PyObject *
3360PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003361 Py_ssize_t size,
3362 const char *encoding,
3363 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003364{
3365 PyObject *buffer = NULL, *unicode;
3366 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003367 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3368
Victor Stinner22eb6892019-06-26 00:51:05 +02003369 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3370 return NULL;
3371 }
3372
Victor Stinnered076ed2019-06-26 01:49:32 +02003373 if (size == 0) {
3374 _Py_RETURN_UNICODE_EMPTY();
3375 }
3376
Victor Stinner942889a2016-09-05 15:40:10 -07003377 if (encoding == NULL) {
3378 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3379 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003380
Fred Drakee4315f52000-05-09 19:53:39 +00003381 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003382 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3383 char *lower = buflower;
3384
3385 /* Fast paths */
3386 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3387 lower += 3;
3388 if (*lower == '_') {
3389 /* Match "utf8" and "utf_8" */
3390 lower++;
3391 }
3392
3393 if (lower[0] == '8' && lower[1] == 0) {
3394 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3395 }
3396 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3397 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3398 }
3399 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3400 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3401 }
3402 }
3403 else {
3404 if (strcmp(lower, "ascii") == 0
3405 || strcmp(lower, "us_ascii") == 0) {
3406 return PyUnicode_DecodeASCII(s, size, errors);
3407 }
Steve Dowercc16be82016-09-08 10:35:16 -07003408 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003409 else if (strcmp(lower, "mbcs") == 0) {
3410 return PyUnicode_DecodeMBCS(s, size, errors);
3411 }
3412 #endif
3413 else if (strcmp(lower, "latin1") == 0
3414 || strcmp(lower, "latin_1") == 0
3415 || strcmp(lower, "iso_8859_1") == 0
3416 || strcmp(lower, "iso8859_1") == 0) {
3417 return PyUnicode_DecodeLatin1(s, size, errors);
3418 }
3419 }
Victor Stinner37296e82010-06-10 13:36:23 +00003420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421
3422 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003423 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003424 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003425 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003426 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003427 if (buffer == NULL)
3428 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003429 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430 if (unicode == NULL)
3431 goto onError;
3432 if (!PyUnicode_Check(unicode)) {
3433 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003434 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003435 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003436 encoding,
3437 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438 Py_DECREF(unicode);
3439 goto onError;
3440 }
3441 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003442 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003443
Benjamin Peterson29060642009-01-31 22:14:21 +00003444 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 Py_XDECREF(buffer);
3446 return NULL;
3447}
3448
Alexander Belopolsky40018472011-02-26 01:02:56 +00003449PyObject *
3450PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003451 const char *encoding,
3452 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003453{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003454 if (!PyUnicode_Check(unicode)) {
3455 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003456 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003457 }
3458
Serhiy Storchaka00939072016-10-27 21:05:49 +03003459 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3460 "PyUnicode_AsDecodedObject() is deprecated; "
3461 "use PyCodec_Decode() to decode from str", 1) < 0)
3462 return NULL;
3463
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003464 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003465 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003466
3467 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003468 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003469}
3470
Alexander Belopolsky40018472011-02-26 01:02:56 +00003471PyObject *
3472PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003473 const char *encoding,
3474 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003475{
3476 PyObject *v;
3477
3478 if (!PyUnicode_Check(unicode)) {
3479 PyErr_BadArgument();
3480 goto onError;
3481 }
3482
Serhiy Storchaka00939072016-10-27 21:05:49 +03003483 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3484 "PyUnicode_AsDecodedUnicode() is deprecated; "
3485 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3486 return NULL;
3487
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003488 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003489 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003490
3491 /* Decode via the codec registry */
3492 v = PyCodec_Decode(unicode, encoding, errors);
3493 if (v == NULL)
3494 goto onError;
3495 if (!PyUnicode_Check(v)) {
3496 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003497 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003498 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003499 encoding,
3500 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003501 Py_DECREF(v);
3502 goto onError;
3503 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003504 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003505
Benjamin Peterson29060642009-01-31 22:14:21 +00003506 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003507 return NULL;
3508}
3509
Alexander Belopolsky40018472011-02-26 01:02:56 +00003510PyObject *
3511PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003512 Py_ssize_t size,
3513 const char *encoding,
3514 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515{
3516 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003517
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003518 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3522 Py_DECREF(unicode);
3523 return v;
3524}
3525
Alexander Belopolsky40018472011-02-26 01:02:56 +00003526PyObject *
3527PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003528 const char *encoding,
3529 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003530{
3531 PyObject *v;
3532
3533 if (!PyUnicode_Check(unicode)) {
3534 PyErr_BadArgument();
3535 goto onError;
3536 }
3537
Serhiy Storchaka00939072016-10-27 21:05:49 +03003538 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3539 "PyUnicode_AsEncodedObject() is deprecated; "
3540 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3541 "or PyCodec_Encode() for generic encoding", 1) < 0)
3542 return NULL;
3543
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003544 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003545 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003546
3547 /* Encode via the codec registry */
3548 v = PyCodec_Encode(unicode, encoding, errors);
3549 if (v == NULL)
3550 goto onError;
3551 return v;
3552
Benjamin Peterson29060642009-01-31 22:14:21 +00003553 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003554 return NULL;
3555}
3556
Victor Stinner1b579672011-12-17 05:47:23 +01003557
Victor Stinner2cba6b82018-01-10 22:46:15 +01003558static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003559unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003560 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003561{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003562 Py_ssize_t wlen;
3563 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3564 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003565 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003566 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003567
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003568 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003569 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003570 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003571 return NULL;
3572 }
3573
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003574 char *str;
3575 size_t error_pos;
3576 const char *reason;
3577 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003578 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003579 PyMem_Free(wstr);
3580
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003581 if (res != 0) {
3582 if (res == -2) {
3583 PyObject *exc;
3584 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3585 "locale", unicode,
3586 (Py_ssize_t)error_pos,
3587 (Py_ssize_t)(error_pos+1),
3588 reason);
3589 if (exc != NULL) {
3590 PyCodec_StrictErrors(exc);
3591 Py_DECREF(exc);
3592 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003593 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003594 else if (res == -3) {
3595 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3596 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003597 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003598 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003599 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003600 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003601 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003602
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003603 PyObject *bytes = PyBytes_FromString(str);
3604 PyMem_RawFree(str);
3605 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003606}
3607
Victor Stinnerad158722010-10-27 00:25:46 +00003608PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003609PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3610{
Victor Stinner709d23d2019-05-02 14:56:30 -04003611 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3612 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003613}
3614
3615PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003616PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003617{
Victor Stinner81a7be32020-04-14 15:14:01 +02003618 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003619 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003620 return unicode_encode_utf8(unicode,
3621 interp->fs_codec.error_handler,
3622 interp->fs_codec.errors);
3623 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003624#ifndef _Py_FORCE_UTF8_FS_ENCODING
3625 else if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003626 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003627 interp->fs_codec.encoding,
3628 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003629 }
Victor Stinnerad158722010-10-27 00:25:46 +00003630#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003631 else {
3632 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3633 machinery is not ready and so cannot be used:
3634 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003635 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3636 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003637 assert(filesystem_errors != NULL);
3638 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3639 assert(errors != _Py_ERROR_UNKNOWN);
3640#ifdef _Py_FORCE_UTF8_FS_ENCODING
3641 return unicode_encode_utf8(unicode, errors, NULL);
3642#else
3643 return unicode_encode_locale(unicode, errors, 0);
3644#endif
3645 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003646}
3647
Alexander Belopolsky40018472011-02-26 01:02:56 +00003648PyObject *
3649PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003650 const char *encoding,
3651 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652{
3653 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003654 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 if (!PyUnicode_Check(unicode)) {
3657 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 }
Fred Drakee4315f52000-05-09 19:53:39 +00003660
Victor Stinner22eb6892019-06-26 00:51:05 +02003661 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3662 return NULL;
3663 }
3664
Victor Stinner942889a2016-09-05 15:40:10 -07003665 if (encoding == NULL) {
3666 return _PyUnicode_AsUTF8String(unicode, errors);
3667 }
3668
Fred Drakee4315f52000-05-09 19:53:39 +00003669 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003670 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3671 char *lower = buflower;
3672
3673 /* Fast paths */
3674 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3675 lower += 3;
3676 if (*lower == '_') {
3677 /* Match "utf8" and "utf_8" */
3678 lower++;
3679 }
3680
3681 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003682 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003683 }
3684 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3685 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3686 }
3687 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3688 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3689 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003690 }
Victor Stinner942889a2016-09-05 15:40:10 -07003691 else {
3692 if (strcmp(lower, "ascii") == 0
3693 || strcmp(lower, "us_ascii") == 0) {
3694 return _PyUnicode_AsASCIIString(unicode, errors);
3695 }
Steve Dowercc16be82016-09-08 10:35:16 -07003696#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003697 else if (strcmp(lower, "mbcs") == 0) {
3698 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3699 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003700#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003701 else if (strcmp(lower, "latin1") == 0 ||
3702 strcmp(lower, "latin_1") == 0 ||
3703 strcmp(lower, "iso_8859_1") == 0 ||
3704 strcmp(lower, "iso8859_1") == 0) {
3705 return _PyUnicode_AsLatin1String(unicode, errors);
3706 }
3707 }
Victor Stinner37296e82010-06-10 13:36:23 +00003708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709
3710 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003711 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003713 return NULL;
3714
3715 /* The normal path */
3716 if (PyBytes_Check(v))
3717 return v;
3718
3719 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003720 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003721 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003722 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003723
3724 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003725 "encoder %s returned bytearray instead of bytes; "
3726 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003727 encoding);
3728 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003729 Py_DECREF(v);
3730 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003731 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003732
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003733 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3734 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003735 Py_DECREF(v);
3736 return b;
3737 }
3738
3739 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003740 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003741 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003742 encoding,
3743 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003744 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003745 return NULL;
3746}
3747
Alexander Belopolsky40018472011-02-26 01:02:56 +00003748PyObject *
3749PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003750 const char *encoding,
3751 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003752{
3753 PyObject *v;
3754
3755 if (!PyUnicode_Check(unicode)) {
3756 PyErr_BadArgument();
3757 goto onError;
3758 }
3759
Serhiy Storchaka00939072016-10-27 21:05:49 +03003760 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3761 "PyUnicode_AsEncodedUnicode() is deprecated; "
3762 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3763 return NULL;
3764
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003765 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003766 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003767
3768 /* Encode via the codec registry */
3769 v = PyCodec_Encode(unicode, encoding, errors);
3770 if (v == NULL)
3771 goto onError;
3772 if (!PyUnicode_Check(v)) {
3773 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003774 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003775 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003776 encoding,
3777 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003778 Py_DECREF(v);
3779 goto onError;
3780 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003782
Benjamin Peterson29060642009-01-31 22:14:21 +00003783 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 return NULL;
3785}
3786
Victor Stinner2cba6b82018-01-10 22:46:15 +01003787static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003788unicode_decode_locale(const char *str, Py_ssize_t len,
3789 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003790{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003791 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3792 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003793 return NULL;
3794 }
3795
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003796 wchar_t *wstr;
3797 size_t wlen;
3798 const char *reason;
3799 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003800 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003801 if (res != 0) {
3802 if (res == -2) {
3803 PyObject *exc;
3804 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3805 "locale", str, len,
3806 (Py_ssize_t)wlen,
3807 (Py_ssize_t)(wlen + 1),
3808 reason);
3809 if (exc != NULL) {
3810 PyCodec_StrictErrors(exc);
3811 Py_DECREF(exc);
3812 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003813 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003814 else if (res == -3) {
3815 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3816 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003817 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003818 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003819 }
Victor Stinner2f197072011-12-17 07:08:30 +01003820 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003821 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003822
3823 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3824 PyMem_RawFree(wstr);
3825 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003826}
3827
3828PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003829PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3830 const char *errors)
3831{
Victor Stinner709d23d2019-05-02 14:56:30 -04003832 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3833 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003834}
3835
3836PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003837PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003838{
3839 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003840 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3841 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003842}
3843
3844
3845PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003846PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003847 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003848 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3849}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003850
Christian Heimes5894ba72007-11-04 11:43:14 +00003851PyObject*
3852PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3853{
Victor Stinner81a7be32020-04-14 15:14:01 +02003854 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003855 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003856 return unicode_decode_utf8(s, size,
3857 interp->fs_codec.error_handler,
3858 interp->fs_codec.errors,
3859 NULL);
3860 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003861#ifndef _Py_FORCE_UTF8_FS_ENCODING
3862 else if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003863 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003864 interp->fs_codec.encoding,
3865 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003866 }
Victor Stinnerad158722010-10-27 00:25:46 +00003867#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003868 else {
3869 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3870 machinery is not ready and so cannot be used:
3871 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003872 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3873 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003874 assert(filesystem_errors != NULL);
3875 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3876 assert(errors != _Py_ERROR_UNKNOWN);
3877#ifdef _Py_FORCE_UTF8_FS_ENCODING
3878 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3879#else
3880 return unicode_decode_locale(s, size, errors, 0);
3881#endif
3882 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003883}
3884
Martin v. Löwis011e8422009-05-05 04:43:17 +00003885
3886int
3887PyUnicode_FSConverter(PyObject* arg, void* addr)
3888{
Brett Cannonec6ce872016-09-06 15:50:29 -07003889 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003890 PyObject *output = NULL;
3891 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03003892 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003893 if (arg == NULL) {
3894 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003895 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003896 return 1;
3897 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003898 path = PyOS_FSPath(arg);
3899 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003900 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003901 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003902 if (PyBytes_Check(path)) {
3903 output = path;
3904 }
3905 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3906 output = PyUnicode_EncodeFSDefault(path);
3907 Py_DECREF(path);
3908 if (!output) {
3909 return 0;
3910 }
3911 assert(PyBytes_Check(output));
3912 }
3913
Victor Stinner0ea2a462010-04-30 00:22:08 +00003914 size = PyBytes_GET_SIZE(output);
3915 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003916 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003917 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003918 Py_DECREF(output);
3919 return 0;
3920 }
3921 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003922 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003923}
3924
3925
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003926int
3927PyUnicode_FSDecoder(PyObject* arg, void* addr)
3928{
Brett Cannona5711202016-09-06 19:36:01 -07003929 int is_buffer = 0;
3930 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003931 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003932 if (arg == NULL) {
3933 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003934 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003935 return 1;
3936 }
Brett Cannona5711202016-09-06 19:36:01 -07003937
3938 is_buffer = PyObject_CheckBuffer(arg);
3939 if (!is_buffer) {
3940 path = PyOS_FSPath(arg);
3941 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003942 return 0;
3943 }
Brett Cannona5711202016-09-06 19:36:01 -07003944 }
3945 else {
3946 path = arg;
3947 Py_INCREF(arg);
3948 }
3949
3950 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003951 output = path;
3952 }
3953 else if (PyBytes_Check(path) || is_buffer) {
3954 PyObject *path_bytes = NULL;
3955
3956 if (!PyBytes_Check(path) &&
3957 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003958 "path should be string, bytes, or os.PathLike, not %.200s",
3959 Py_TYPE(arg)->tp_name)) {
3960 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003961 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003962 }
3963 path_bytes = PyBytes_FromObject(path);
3964 Py_DECREF(path);
3965 if (!path_bytes) {
3966 return 0;
3967 }
3968 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3969 PyBytes_GET_SIZE(path_bytes));
3970 Py_DECREF(path_bytes);
3971 if (!output) {
3972 return 0;
3973 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003974 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003975 else {
3976 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003977 "path should be string, bytes, or os.PathLike, not %.200s",
3978 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003979 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003980 return 0;
3981 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003982 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003983 Py_DECREF(output);
3984 return 0;
3985 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003987 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003988 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003989 Py_DECREF(output);
3990 return 0;
3991 }
3992 *(PyObject**)addr = output;
3993 return Py_CLEANUP_SUPPORTED;
3994}
3995
3996
Inada Naoki02a4d572020-02-27 13:48:59 +09003997static int unicode_fill_utf8(PyObject *unicode);
3998
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003999const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004001{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004002 if (!PyUnicode_Check(unicode)) {
4003 PyErr_BadArgument();
4004 return NULL;
4005 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004006 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004007 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004009 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004010 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 return NULL;
4012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013 }
4014
4015 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004016 *psize = PyUnicode_UTF8_LENGTH(unicode);
4017 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004018}
4019
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004020const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004022{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4024}
4025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004026Py_UNICODE *
4027PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4028{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004029 if (!PyUnicode_Check(unicode)) {
4030 PyErr_BadArgument();
4031 return NULL;
4032 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004033 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4034 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004036 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004037 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038
Serhiy Storchakac46db922018-10-23 22:58:24 +03004039 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4040 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4041 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004044 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4045 if (w == NULL) {
4046 PyErr_NoMemory();
4047 return NULL;
4048 }
4049 unicode_copy_as_widechar(unicode, w, wlen + 1);
4050 _PyUnicode_WSTR(unicode) = w;
4051 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4052 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 }
4054 }
4055 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004056 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004057 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004058}
4059
Alexander Belopolsky40018472011-02-26 01:02:56 +00004060Py_UNICODE *
4061PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064}
4065
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004066const Py_UNICODE *
4067_PyUnicode_AsUnicode(PyObject *unicode)
4068{
4069 Py_ssize_t size;
4070 const Py_UNICODE *wstr;
4071
4072 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4073 if (wstr && wcslen(wstr) != (size_t)size) {
4074 PyErr_SetString(PyExc_ValueError, "embedded null character");
4075 return NULL;
4076 }
4077 return wstr;
4078}
4079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080
Alexander Belopolsky40018472011-02-26 01:02:56 +00004081Py_ssize_t
4082PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083{
4084 if (!PyUnicode_Check(unicode)) {
4085 PyErr_BadArgument();
4086 goto onError;
4087 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004088 if (_PyUnicode_WSTR(unicode) == NULL) {
4089 if (PyUnicode_AsUnicode(unicode) == NULL)
4090 goto onError;
4091 }
4092 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093
Benjamin Peterson29060642009-01-31 22:14:21 +00004094 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 return -1;
4096}
4097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004098Py_ssize_t
4099PyUnicode_GetLength(PyObject *unicode)
4100{
Victor Stinner07621332012-06-16 04:53:46 +02004101 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004102 PyErr_BadArgument();
4103 return -1;
4104 }
Victor Stinner07621332012-06-16 04:53:46 +02004105 if (PyUnicode_READY(unicode) == -1)
4106 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107 return PyUnicode_GET_LENGTH(unicode);
4108}
4109
4110Py_UCS4
4111PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4112{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004113 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004114 int kind;
4115
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004116 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004117 PyErr_BadArgument();
4118 return (Py_UCS4)-1;
4119 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004120 if (PyUnicode_READY(unicode) == -1) {
4121 return (Py_UCS4)-1;
4122 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004123 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004124 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004125 return (Py_UCS4)-1;
4126 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004127 data = PyUnicode_DATA(unicode);
4128 kind = PyUnicode_KIND(unicode);
4129 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004130}
4131
4132int
4133PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4134{
4135 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004136 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004137 return -1;
4138 }
Victor Stinner488fa492011-12-12 00:01:39 +01004139 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004140 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004141 PyErr_SetString(PyExc_IndexError, "string index out of range");
4142 return -1;
4143 }
Victor Stinner488fa492011-12-12 00:01:39 +01004144 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004145 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004146 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4147 PyErr_SetString(PyExc_ValueError, "character out of range");
4148 return -1;
4149 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004150 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4151 index, ch);
4152 return 0;
4153}
4154
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage; callers must not free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4160
Victor Stinner554f3f02010-06-16 23:33:54 +00004161/* create or adjust a UnicodeDecodeError */
4162static void
4163make_decode_exception(PyObject **exceptionObject,
4164 const char *encoding,
4165 const char *input, Py_ssize_t length,
4166 Py_ssize_t startpos, Py_ssize_t endpos,
4167 const char *reason)
4168{
4169 if (*exceptionObject == NULL) {
4170 *exceptionObject = PyUnicodeDecodeError_Create(
4171 encoding, input, length, startpos, endpos, reason);
4172 }
4173 else {
4174 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4175 goto onError;
4176 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4177 goto onError;
4178 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4179 goto onError;
4180 }
4181 return;
4182
4183onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004184 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004185}
4186
Steve Dowercc16be82016-09-08 10:35:16 -07004187#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004188static int
4189widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4190{
4191 if (newsize > *size) {
4192 wchar_t *newbuf = *buf;
4193 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4194 PyErr_NoMemory();
4195 return -1;
4196 }
4197 *buf = newbuf;
4198 }
4199 *size = newsize;
4200 return 0;
4201}
4202
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203/* error handling callback helper:
4204 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004205 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 and adjust various state variables.
4207 return 0 on success, -1 on error
4208*/
4209
Alexander Belopolsky40018472011-02-26 01:02:56 +00004210static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004211unicode_decode_call_errorhandler_wchar(
4212 const char *errors, PyObject **errorHandler,
4213 const char *encoding, const char *reason,
4214 const char **input, const char **inend, Py_ssize_t *startinpos,
4215 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004216 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004218 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219
4220 PyObject *restuple = NULL;
4221 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004222 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004223 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004224 Py_ssize_t requiredsize;
4225 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004226 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004227 wchar_t *repwstr;
4228 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004229
4230 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 *errorHandler = PyCodec_LookupError(errors);
4232 if (*errorHandler == NULL)
4233 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004234 }
4235
Victor Stinner554f3f02010-06-16 23:33:54 +00004236 make_decode_exception(exceptionObject,
4237 encoding,
4238 *input, *inend - *input,
4239 *startinpos, *endinpos,
4240 reason);
4241 if (*exceptionObject == NULL)
4242 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243
Petr Viktorinffd97532020-02-11 17:46:57 +01004244 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004248 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004249 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004250 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004251 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004253
4254 /* Copy back the bytes variables, which might have been modified by the
4255 callback */
4256 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4257 if (!inputobj)
4258 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004259 *input = PyBytes_AS_STRING(inputobj);
4260 insize = PyBytes_GET_SIZE(inputobj);
4261 *inend = *input + insize;
4262 /* we can DECREF safely, as the exception has another reference,
4263 so the object won't go away. */
4264 Py_DECREF(inputobj);
4265
4266 if (newpos<0)
4267 newpos = insize+newpos;
4268 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004269 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004270 goto onError;
4271 }
4272
4273 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4274 if (repwstr == NULL)
4275 goto onError;
4276 /* need more space? (at least enough for what we
4277 have+the replacement+the rest of the string (starting
4278 at the new input position), so we won't have to check space
4279 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004280 requiredsize = *outpos;
4281 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4282 goto overflow;
4283 requiredsize += repwlen;
4284 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4285 goto overflow;
4286 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004287 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004288 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004289 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004290 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004291 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004293 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004294 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004295 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004296 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004297 *endinpos = newpos;
4298 *inptr = *input + newpos;
4299
4300 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004301 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004302 return 0;
4303
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004304 overflow:
4305 PyErr_SetString(PyExc_OverflowError,
4306 "decoded result is too long for a Python string");
4307
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004308 onError:
4309 Py_XDECREF(restuple);
4310 return -1;
4311}
Steve Dowercc16be82016-09-08 10:35:16 -07004312#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004313
4314static int
4315unicode_decode_call_errorhandler_writer(
4316 const char *errors, PyObject **errorHandler,
4317 const char *encoding, const char *reason,
4318 const char **input, const char **inend, Py_ssize_t *startinpos,
4319 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4320 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4321{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004322 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004323
4324 PyObject *restuple = NULL;
4325 PyObject *repunicode = NULL;
4326 Py_ssize_t insize;
4327 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004328 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004329 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004331 int need_to_grow = 0;
4332 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004333
4334 if (*errorHandler == NULL) {
4335 *errorHandler = PyCodec_LookupError(errors);
4336 if (*errorHandler == NULL)
4337 goto onError;
4338 }
4339
4340 make_decode_exception(exceptionObject,
4341 encoding,
4342 *input, *inend - *input,
4343 *startinpos, *endinpos,
4344 reason);
4345 if (*exceptionObject == NULL)
4346 goto onError;
4347
Petr Viktorinffd97532020-02-11 17:46:57 +01004348 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004349 if (restuple == NULL)
4350 goto onError;
4351 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004352 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004353 goto onError;
4354 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004355 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004356 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004357
4358 /* Copy back the bytes variables, which might have been modified by the
4359 callback */
4360 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4361 if (!inputobj)
4362 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004363 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004364 *input = PyBytes_AS_STRING(inputobj);
4365 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004366 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004367 /* we can DECREF safely, as the exception has another reference,
4368 so the object won't go away. */
4369 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004370
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004373 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004374 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004375 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004377
Victor Stinner170ca6f2013-04-18 00:25:28 +02004378 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004379 if (replen > 1) {
4380 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004381 need_to_grow = 1;
4382 }
4383 new_inptr = *input + newpos;
4384 if (*inend - new_inptr > remain) {
4385 /* We don't know the decoding algorithm here so we make the worst
4386 assumption that one byte decodes to one unicode character.
4387 If unfortunately one byte could decode to more unicode characters,
4388 the decoder may write out-of-bound then. Is it possible for the
4389 algorithms using this function? */
4390 writer->min_length += *inend - new_inptr - remain;
4391 need_to_grow = 1;
4392 }
4393 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004394 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004395 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004396 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4397 goto onError;
4398 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004399 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004400 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004401
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004403 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004404
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004406 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004407 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408
Benjamin Peterson29060642009-01-31 22:14:21 +00004409 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004411 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412}
4413
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
 *
 * NOTE: these macros evaluate their argument more than once; do not pass
 * an expression with side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* Given that c is a base-64 character, what is its base-64 value?
 * (Any c that is not a letter, digit or '+' maps to 63.) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test on c comes first so that utf7_category is never indexed
 * with a value outside 1..127.  directO and directWS are parenthesized so
 * that compound expressions (e.g. `a || b`) can be passed safely.  c is
 * evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004494
Alexander Belopolsky40018472011-02-26 01:02:56 +00004495PyObject *
4496PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004497 Py_ssize_t size,
4498 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004499{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004500 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4501}
4502
Antoine Pitrou244651a2009-05-04 18:56:13 +00004503/* The decoder. The only state we preserve is our read position,
4504 * i.e. how many characters we have consumed. So if we end in the
4505 * middle of a shift sequence we have to back off the read position
4506 * and the output to the beginning of the sequence, otherwise we lose
4507 * all the shift state (seen bits, number of bits seen, high
4508 * surrogate). */
4509
Alexander Belopolsky40018472011-02-26 01:02:56 +00004510PyObject *
4511PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004512 Py_ssize_t size,
4513 const char *errors,
4514 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004515{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004517 Py_ssize_t startinpos;
4518 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004520 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521 const char *errmsg = "";
4522 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004523 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 unsigned int base64bits = 0;
4525 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004526 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 PyObject *errorHandler = NULL;
4528 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004529
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004530 if (size == 0) {
4531 if (consumed)
4532 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004533 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004534 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004536 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004537 _PyUnicodeWriter_Init(&writer);
4538 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004539
4540 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541 e = s + size;
4542
4543 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004544 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004546 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004547
Antoine Pitrou244651a2009-05-04 18:56:13 +00004548 if (inShift) { /* in a base-64 section */
4549 if (IS_BASE64(ch)) { /* consume a base-64 character */
4550 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4551 base64bits += 6;
4552 s++;
4553 if (base64bits >= 16) {
4554 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004555 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004556 base64bits -= 16;
4557 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004558 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 if (surrogate) {
4560 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004561 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4562 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004563 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004564 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004566 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 }
4568 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004569 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004570 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004571 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 }
4573 }
Victor Stinner551ac952011-11-29 22:58:13 +01004574 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004575 /* first surrogate */
4576 surrogate = outCh;
4577 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004579 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004580 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 }
4582 }
4583 }
4584 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004585 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004586 if (base64bits > 0) { /* left-over bits */
4587 if (base64bits >= 6) {
4588 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004589 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 errmsg = "partial character in shift sequence";
4591 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004592 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 else {
4594 /* Some bits remain; they should be zero */
4595 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004596 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 errmsg = "non-zero padding bits in shift sequence";
4598 goto utf7Error;
4599 }
4600 }
4601 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004602 if (surrogate && DECODE_DIRECT(ch)) {
4603 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4604 goto onError;
4605 }
4606 surrogate = 0;
4607 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 /* '-' is absorbed; other terminating
4609 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004610 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004611 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004612 }
4613 }
4614 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004615 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 s++; /* consume '+' */
4617 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004619 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004620 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004621 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004622 else if (s < e && !IS_BASE64(*s)) {
4623 s++;
4624 errmsg = "ill-formed sequence";
4625 goto utf7Error;
4626 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004628 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004629 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004630 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004631 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004632 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004633 }
4634 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004635 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004636 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004637 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004638 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004639 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004640 else {
4641 startinpos = s-starts;
4642 s++;
4643 errmsg = "unexpected special character";
4644 goto utf7Error;
4645 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004646 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004647utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004649 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004650 errors, &errorHandler,
4651 "utf7", errmsg,
4652 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004653 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004654 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004655 }
4656
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657 /* end of string */
4658
4659 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4660 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004661 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004662 if (surrogate ||
4663 (base64bits >= 6) ||
4664 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004665 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004666 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 errors, &errorHandler,
4668 "utf7", "unterminated shift sequence",
4669 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004670 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004671 goto onError;
4672 if (s < e)
4673 goto restart;
4674 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004675 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004676
4677 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004678 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004679 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004680 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004681 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004682 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004683 writer.kind, writer.data, shiftOutStart);
4684 Py_XDECREF(errorHandler);
4685 Py_XDECREF(exc);
4686 _PyUnicodeWriter_Dealloc(&writer);
4687 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004688 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004689 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004690 }
4691 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004692 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004693 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004694 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 Py_XDECREF(errorHandler);
4697 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004698 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004699
Benjamin Peterson29060642009-01-31 22:14:21 +00004700 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004701 Py_XDECREF(errorHandler);
4702 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004703 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004704 return NULL;
4705}
4706
4707
Alexander Belopolsky40018472011-02-26 01:02:56 +00004708PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004709_PyUnicode_EncodeUTF7(PyObject *str,
4710 int base64SetO,
4711 int base64WhiteSpace,
4712 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004713{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004714 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004715 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004716 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004717 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004718 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004719 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004720 unsigned int base64bits = 0;
4721 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004722 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004723 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004724
Benjamin Petersonbac79492012-01-14 13:34:47 -05004725 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004726 return NULL;
4727 kind = PyUnicode_KIND(str);
4728 data = PyUnicode_DATA(str);
4729 len = PyUnicode_GET_LENGTH(str);
4730
4731 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004732 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004733
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004734 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004735 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004736 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004737 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004738 if (v == NULL)
4739 return NULL;
4740
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004741 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004742 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004743 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004744
Antoine Pitrou244651a2009-05-04 18:56:13 +00004745 if (inShift) {
4746 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4747 /* shifting out */
4748 if (base64bits) { /* output remaining bits */
4749 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4750 base64buffer = 0;
4751 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004752 }
4753 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004754 /* Characters not in the BASE64 set implicitly unshift the sequence
4755 so no '-' is required, except if the character is itself a '-' */
4756 if (IS_BASE64(ch) || ch == '-') {
4757 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004758 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004759 *out++ = (char) ch;
4760 }
4761 else {
4762 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004763 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004764 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004765 else { /* not in a shift sequence */
4766 if (ch == '+') {
4767 *out++ = '+';
4768 *out++ = '-';
4769 }
4770 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4771 *out++ = (char) ch;
4772 }
4773 else {
4774 *out++ = '+';
4775 inShift = 1;
4776 goto encode_char;
4777 }
4778 }
4779 continue;
4780encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004782 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004783
Antoine Pitrou244651a2009-05-04 18:56:13 +00004784 /* code first surrogate */
4785 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004786 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004787 while (base64bits >= 6) {
4788 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4789 base64bits -= 6;
4790 }
4791 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004792 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004793 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004794 base64bits += 16;
4795 base64buffer = (base64buffer << 16) | ch;
4796 while (base64bits >= 6) {
4797 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4798 base64bits -= 6;
4799 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004800 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 if (base64bits)
4802 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4803 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004804 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004805 if (_PyBytes_Resize(&v, out - start) < 0)
4806 return NULL;
4807 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004808}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004809PyObject *
4810PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4811 Py_ssize_t size,
4812 int base64SetO,
4813 int base64WhiteSpace,
4814 const char *errors)
4815{
4816 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004817 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004818 if (tmp == NULL)
4819 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004820 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004821 base64WhiteSpace, errors);
4822 Py_DECREF(tmp);
4823 return result;
4824}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004825
Antoine Pitrou244651a2009-05-04 18:56:13 +00004826#undef IS_BASE64
4827#undef FROM_BASE64
4828#undef TO_BASE64
4829#undef DECODE_DIRECT
4830#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004831
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832/* --- UTF-8 Codec -------------------------------------------------------- */
4833
Alexander Belopolsky40018472011-02-26 01:02:56 +00004834PyObject *
4835PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004836 Py_ssize_t size,
4837 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838{
Walter Dörwald69652032004-09-07 20:24:22 +00004839 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4840}
4841
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004842#include "stringlib/asciilib.h"
4843#include "stringlib/codecs.h"
4844#include "stringlib/undef.h"
4845
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004846#include "stringlib/ucs1lib.h"
4847#include "stringlib/codecs.h"
4848#include "stringlib/undef.h"
4849
4850#include "stringlib/ucs2lib.h"
4851#include "stringlib/codecs.h"
4852#include "stringlib/undef.h"
4853
4854#include "stringlib/ucs4lib.h"
4855#include "stringlib/codecs.h"
4856#include "stringlib/undef.h"
4857
Antoine Pitrouab868312009-01-10 15:40:25 +00004858/* Mask to quickly check whether a C 'long' contains a
4859 non-ASCII, UTF8-encoded char. */
4860#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004861# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004862#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004863# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004864#else
4865# error C 'long' size should be either 4 or 8!
4866#endif
4867
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004868static Py_ssize_t
4869ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004870{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004871 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004872 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004873
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004874 /*
4875 * Issue #17237: m68k is a bit different from most architectures in
4876 * that objects do not use "natural alignment" - for example, int and
4877 * long are only aligned at 2-byte boundaries. Therefore the assert()
4878 * won't work; also, tests have shown that skipping the "optimised
4879 * version" will even speed up m68k.
4880 */
4881#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004883 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4884 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004885 /* Fast path, see in STRINGLIB(utf8_decode) for
4886 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004887 /* Help allocation */
4888 const char *_p = p;
4889 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004890 while (_p < aligned_end) {
4891 unsigned long value = *(const unsigned long *) _p;
4892 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004893 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004894 *((unsigned long *)q) = value;
4895 _p += SIZEOF_LONG;
4896 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004897 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004898 p = _p;
4899 while (p < end) {
4900 if ((unsigned char)*p & 0x80)
4901 break;
4902 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004904 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004906#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004907#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004908 while (p < end) {
4909 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4910 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004911 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004912 /* Help allocation */
4913 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004915 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004916 if (value & ASCII_CHAR_MASK)
4917 break;
4918 _p += SIZEOF_LONG;
4919 }
4920 p = _p;
4921 if (_p == end)
4922 break;
4923 }
4924 if ((unsigned char)*p & 0x80)
4925 break;
4926 ++p;
4927 }
4928 memcpy(dest, start, p - start);
4929 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930}
Antoine Pitrouab868312009-01-10 15:40:25 +00004931
Victor Stinner709d23d2019-05-02 14:56:30 -04004932static PyObject *
4933unicode_decode_utf8(const char *s, Py_ssize_t size,
4934 _Py_error_handler error_handler, const char *errors,
4935 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004936{
Victor Stinner785938e2011-12-11 20:09:03 +01004937 if (size == 0) {
4938 if (consumed)
4939 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004940 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004941 }
4942
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004943 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4944 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004945 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004946 *consumed = 1;
4947 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004948 }
4949
Inada Naoki770847a2019-06-24 12:30:24 +09004950 const char *starts = s;
4951 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004952
Inada Naoki770847a2019-06-24 12:30:24 +09004953 // fast path: try ASCII string.
4954 PyObject *u = PyUnicode_New(size, 127);
4955 if (u == NULL) {
4956 return NULL;
4957 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004958 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09004959 if (s == end) {
4960 return u;
4961 }
4962
4963 // Use _PyUnicodeWriter after fast path is failed.
4964 _PyUnicodeWriter writer;
4965 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4966 writer.pos = s - starts;
4967
4968 Py_ssize_t startinpos, endinpos;
4969 const char *errmsg = "";
4970 PyObject *error_handler_obj = NULL;
4971 PyObject *exc = NULL;
4972
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004973 while (s < end) {
4974 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004975 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004976
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004977 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004978 if (PyUnicode_IS_ASCII(writer.buffer))
4979 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004980 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004981 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004983 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984 } else {
4985 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004986 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 }
4988
4989 switch (ch) {
4990 case 0:
4991 if (s == end || consumed)
4992 goto End;
4993 errmsg = "unexpected end of data";
4994 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004995 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004996 break;
4997 case 1:
4998 errmsg = "invalid start byte";
4999 startinpos = s - starts;
5000 endinpos = startinpos + 1;
5001 break;
5002 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005003 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5004 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5005 {
5006 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005007 goto End;
5008 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005009 /* fall through */
5010 case 3:
5011 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 errmsg = "invalid continuation byte";
5013 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005014 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 break;
5016 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005017 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005018 goto onError;
5019 continue;
5020 }
5021
Victor Stinner1d65d912015-10-05 13:43:50 +02005022 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005023 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005024
5025 switch (error_handler) {
5026 case _Py_ERROR_IGNORE:
5027 s += (endinpos - startinpos);
5028 break;
5029
5030 case _Py_ERROR_REPLACE:
5031 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5032 goto onError;
5033 s += (endinpos - startinpos);
5034 break;
5035
5036 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005037 {
5038 Py_ssize_t i;
5039
Victor Stinner1d65d912015-10-05 13:43:50 +02005040 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5041 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005042 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005043 ch = (Py_UCS4)(unsigned char)(starts[i]);
5044 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5045 ch + 0xdc00);
5046 writer.pos++;
5047 }
5048 s += (endinpos - startinpos);
5049 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005050 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005051
5052 default:
5053 if (unicode_decode_call_errorhandler_writer(
5054 errors, &error_handler_obj,
5055 "utf-8", errmsg,
5056 &starts, &end, &startinpos, &endinpos, &exc, &s,
5057 &writer))
5058 goto onError;
5059 }
Victor Stinner785938e2011-12-11 20:09:03 +01005060 }
5061
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005062End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005063 if (consumed)
5064 *consumed = s - starts;
5065
Victor Stinner1d65d912015-10-05 13:43:50 +02005066 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005067 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005068 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005069
5070onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005071 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005073 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005074 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005075}
5076
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005077
Victor Stinner709d23d2019-05-02 14:56:30 -04005078PyObject *
5079PyUnicode_DecodeUTF8Stateful(const char *s,
5080 Py_ssize_t size,
5081 const char *errors,
5082 Py_ssize_t *consumed)
5083{
5084 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5085}
5086
5087
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005088/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5089 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005090
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005091 On success, write a pointer to a newly allocated wide character string into
5092 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5093 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005094
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005095 On memory allocation failure, return -1.
5096
5097 On decoding error (if surrogateescape is zero), return -2. If wlen is
5098 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5099 is not NULL, write the decoding error message into *reason. */
5100int
5101_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005102 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005103{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005104 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005105 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005106 wchar_t *unicode;
5107 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005108
Victor Stinner3d4226a2018-08-29 22:21:32 +02005109 int surrogateescape = 0;
5110 int surrogatepass = 0;
5111 switch (errors)
5112 {
5113 case _Py_ERROR_STRICT:
5114 break;
5115 case _Py_ERROR_SURROGATEESCAPE:
5116 surrogateescape = 1;
5117 break;
5118 case _Py_ERROR_SURROGATEPASS:
5119 surrogatepass = 1;
5120 break;
5121 default:
5122 return -3;
5123 }
5124
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005125 /* Note: size will always be longer than the resulting Unicode
5126 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005127 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005128 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005129 }
5130
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005131 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005132 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005133 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005134 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005135
5136 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005137 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005138 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005139 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005140 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005141#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005142 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005143#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005144 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005145#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005146 if (ch > 0xFF) {
5147#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005148 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005149#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005150 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005151 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005152 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5153 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5154#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005155 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005156 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005157 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005158 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005159 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005160
5161 if (surrogateescape) {
5162 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5163 }
5164 else {
5165 /* Is it a valid three-byte code? */
5166 if (surrogatepass
5167 && (e - s) >= 3
5168 && (s[0] & 0xf0) == 0xe0
5169 && (s[1] & 0xc0) == 0x80
5170 && (s[2] & 0xc0) == 0x80)
5171 {
5172 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5173 s += 3;
5174 unicode[outpos++] = ch;
5175 }
5176 else {
5177 PyMem_RawFree(unicode );
5178 if (reason != NULL) {
5179 switch (ch) {
5180 case 0:
5181 *reason = "unexpected end of data";
5182 break;
5183 case 1:
5184 *reason = "invalid start byte";
5185 break;
5186 /* 2, 3, 4 */
5187 default:
5188 *reason = "invalid continuation byte";
5189 break;
5190 }
5191 }
5192 if (wlen != NULL) {
5193 *wlen = s - orig_s;
5194 }
5195 return -2;
5196 }
5197 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005198 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005199 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005200 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005201 if (wlen) {
5202 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005203 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005204 *wstr = unicode;
5205 return 0;
5206}
5207
Victor Stinner5f9cf232019-03-19 01:46:25 +01005208
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005209wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005210_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5211 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005212{
5213 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005214 int res = _Py_DecodeUTF8Ex(arg, arglen,
5215 &wstr, wlen,
5216 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005217 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005218 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5219 assert(res != -3);
5220 if (wlen) {
5221 *wlen = (size_t)res;
5222 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005223 return NULL;
5224 }
5225 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005226}
5227
Antoine Pitrouab868312009-01-10 15:40:25 +00005228
Victor Stinnere47e6982017-12-21 15:45:16 +01005229/* UTF-8 encoder using the surrogateescape error handler .
5230
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005231 On success, return 0 and write the newly allocated character string (use
5232 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005233
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005234 On encoding failure, return -2 and write the position of the invalid
5235 surrogate character into *error_pos (if error_pos is set) and the decoding
5236 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005237
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005238 On memory allocation failure, return -1. */
5239int
5240_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005241 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005242{
5243 const Py_ssize_t max_char_size = 4;
5244 Py_ssize_t len = wcslen(text);
5245
5246 assert(len >= 0);
5247
Victor Stinner3d4226a2018-08-29 22:21:32 +02005248 int surrogateescape = 0;
5249 int surrogatepass = 0;
5250 switch (errors)
5251 {
5252 case _Py_ERROR_STRICT:
5253 break;
5254 case _Py_ERROR_SURROGATEESCAPE:
5255 surrogateescape = 1;
5256 break;
5257 case _Py_ERROR_SURROGATEPASS:
5258 surrogatepass = 1;
5259 break;
5260 default:
5261 return -3;
5262 }
5263
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005264 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5265 return -1;
5266 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005267 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005268 if (raw_malloc) {
5269 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005270 }
5271 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005272 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005273 }
5274 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005275 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005276 }
5277
5278 char *p = bytes;
5279 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005280 for (i = 0; i < len; ) {
5281 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005282 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005283 i++;
5284#if Py_UNICODE_SIZE == 2
5285 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5286 && i < len
5287 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5288 {
5289 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5290 i++;
5291 }
5292#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005293
5294 if (ch < 0x80) {
5295 /* Encode ASCII */
5296 *p++ = (char) ch;
5297
5298 }
5299 else if (ch < 0x0800) {
5300 /* Encode Latin-1 */
5301 *p++ = (char)(0xc0 | (ch >> 6));
5302 *p++ = (char)(0x80 | (ch & 0x3f));
5303 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005304 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005305 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005306 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005307 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005308 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005309 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005310 if (reason != NULL) {
5311 *reason = "encoding error";
5312 }
5313 if (raw_malloc) {
5314 PyMem_RawFree(bytes);
5315 }
5316 else {
5317 PyMem_Free(bytes);
5318 }
5319 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005320 }
5321 *p++ = (char)(ch & 0xff);
5322 }
5323 else if (ch < 0x10000) {
5324 *p++ = (char)(0xe0 | (ch >> 12));
5325 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5326 *p++ = (char)(0x80 | (ch & 0x3f));
5327 }
5328 else { /* ch >= 0x10000 */
5329 assert(ch <= MAX_UNICODE);
5330 /* Encode UCS4 Unicode ordinals */
5331 *p++ = (char)(0xf0 | (ch >> 18));
5332 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5333 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5334 *p++ = (char)(0x80 | (ch & 0x3f));
5335 }
5336 }
5337 *p++ = '\0';
5338
5339 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005340 char *bytes2;
5341 if (raw_malloc) {
5342 bytes2 = PyMem_RawRealloc(bytes, final_size);
5343 }
5344 else {
5345 bytes2 = PyMem_Realloc(bytes, final_size);
5346 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005347 if (bytes2 == NULL) {
5348 if (error_pos != NULL) {
5349 *error_pos = (size_t)-1;
5350 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005351 if (raw_malloc) {
5352 PyMem_RawFree(bytes);
5353 }
5354 else {
5355 PyMem_Free(bytes);
5356 }
5357 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005358 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005359 *str = bytes2;
5360 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005361}
5362
5363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005364/* Primary internal function which creates utf8 encoded bytes objects.
5365
5366 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005367 and allocate exactly as much space needed at the end. Else allocate the
5368 maximum possible needed (4 result bytes per Unicode character), and return
5369 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005370*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005371static PyObject *
5372unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5373 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005375 if (!PyUnicode_Check(unicode)) {
5376 PyErr_BadArgument();
5377 return NULL;
5378 }
5379
5380 if (PyUnicode_READY(unicode) == -1)
5381 return NULL;
5382
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005383 if (PyUnicode_UTF8(unicode))
5384 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5385 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005386
Inada Naoki02a4d572020-02-27 13:48:59 +09005387 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005388 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005389 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5390
5391 _PyBytesWriter writer;
5392 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005393
Benjamin Petersonead6b532011-12-20 17:23:42 -06005394 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005395 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005396 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005397 case PyUnicode_1BYTE_KIND:
5398 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5399 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005400 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5401 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005402 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005403 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5404 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005405 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005406 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5407 break;
Tim Peters602f7402002-04-27 18:03:26 +00005408 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005409
5410 if (end == NULL) {
5411 _PyBytesWriter_Dealloc(&writer);
5412 return NULL;
5413 }
5414 return _PyBytesWriter_Finish(&writer, end);
5415}
5416
5417static int
5418unicode_fill_utf8(PyObject *unicode)
5419{
5420 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5421 assert(!PyUnicode_IS_ASCII(unicode));
5422
5423 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005424 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005425 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5426
5427 _PyBytesWriter writer;
5428 char *end;
5429
5430 switch (kind) {
5431 default:
5432 Py_UNREACHABLE();
5433 case PyUnicode_1BYTE_KIND:
5434 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5435 _Py_ERROR_STRICT, NULL);
5436 break;
5437 case PyUnicode_2BYTE_KIND:
5438 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5439 _Py_ERROR_STRICT, NULL);
5440 break;
5441 case PyUnicode_4BYTE_KIND:
5442 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5443 _Py_ERROR_STRICT, NULL);
5444 break;
5445 }
5446 if (end == NULL) {
5447 _PyBytesWriter_Dealloc(&writer);
5448 return -1;
5449 }
5450
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005451 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005452 PyBytes_AS_STRING(writer.buffer);
5453 Py_ssize_t len = end - start;
5454
5455 char *cache = PyObject_MALLOC(len + 1);
5456 if (cache == NULL) {
5457 _PyBytesWriter_Dealloc(&writer);
5458 PyErr_NoMemory();
5459 return -1;
5460 }
5461 _PyUnicode_UTF8(unicode) = cache;
5462 _PyUnicode_UTF8_LENGTH(unicode) = len;
5463 memcpy(cache, start, len);
5464 cache[len] = '\0';
5465 _PyBytesWriter_Dealloc(&writer);
5466 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467}
5468
Alexander Belopolsky40018472011-02-26 01:02:56 +00005469PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005470_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5471{
5472 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5473}
5474
5475
5476PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005477PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5478 Py_ssize_t size,
5479 const char *errors)
5480{
5481 PyObject *v, *unicode;
5482
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005483 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005484 if (unicode == NULL)
5485 return NULL;
5486 v = _PyUnicode_AsUTF8String(unicode, errors);
5487 Py_DECREF(unicode);
5488 return v;
5489}
5490
5491PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005492PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005494 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495}
5496
Walter Dörwald41980ca2007-08-16 21:55:45 +00005497/* --- UTF-32 Codec ------------------------------------------------------- */
5498
5499PyObject *
5500PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 Py_ssize_t size,
5502 const char *errors,
5503 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005504{
5505 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5506}
5507
5508PyObject *
5509PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 Py_ssize_t size,
5511 const char *errors,
5512 int *byteorder,
5513 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005514{
5515 const char *starts = s;
5516 Py_ssize_t startinpos;
5517 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005518 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005519 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005520 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005521 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005522 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005523 PyObject *errorHandler = NULL;
5524 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005525
Andy Lestere6be9b52020-02-11 20:28:35 -06005526 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005527 e = q + size;
5528
5529 if (byteorder)
5530 bo = *byteorder;
5531
5532 /* Check for BOM marks (U+FEFF) in the input and adjust current
5533 byte order setting accordingly. In native mode, the leading BOM
5534 mark is skipped, in all other modes, it is copied to the output
5535 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005536 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005537 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005538 if (bom == 0x0000FEFF) {
5539 bo = -1;
5540 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005542 else if (bom == 0xFFFE0000) {
5543 bo = 1;
5544 q += 4;
5545 }
5546 if (byteorder)
5547 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005548 }
5549
Victor Stinnere64322e2012-10-30 23:12:47 +01005550 if (q == e) {
5551 if (consumed)
5552 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005553 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005554 }
5555
Victor Stinnere64322e2012-10-30 23:12:47 +01005556#ifdef WORDS_BIGENDIAN
5557 le = bo < 0;
5558#else
5559 le = bo <= 0;
5560#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005561 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005562
Victor Stinner8f674cc2013-04-17 23:02:17 +02005563 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005564 writer.min_length = (e - q + 3) / 4;
5565 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005566 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005567
Victor Stinnere64322e2012-10-30 23:12:47 +01005568 while (1) {
5569 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005570 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005571
Victor Stinnere64322e2012-10-30 23:12:47 +01005572 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005573 enum PyUnicode_Kind kind = writer.kind;
5574 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005575 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005576 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005577 if (le) {
5578 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005579 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005580 if (ch > maxch)
5581 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005582 if (kind != PyUnicode_1BYTE_KIND &&
5583 Py_UNICODE_IS_SURROGATE(ch))
5584 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005585 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005586 q += 4;
5587 } while (q <= last);
5588 }
5589 else {
5590 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005591 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005592 if (ch > maxch)
5593 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005594 if (kind != PyUnicode_1BYTE_KIND &&
5595 Py_UNICODE_IS_SURROGATE(ch))
5596 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005597 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005598 q += 4;
5599 } while (q <= last);
5600 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005601 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005602 }
5603
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005604 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005605 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005606 startinpos = ((const char *)q) - starts;
5607 endinpos = startinpos + 4;
5608 }
5609 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005610 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005612 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005614 startinpos = ((const char *)q) - starts;
5615 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005617 else {
5618 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005619 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005620 goto onError;
5621 q += 4;
5622 continue;
5623 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005624 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005625 startinpos = ((const char *)q) - starts;
5626 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005628
5629 /* The remaining input chars are ignored if the callback
5630 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005631 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005633 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005635 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005636 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005637 }
5638
Walter Dörwald41980ca2007-08-16 21:55:45 +00005639 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005641
Walter Dörwald41980ca2007-08-16 21:55:45 +00005642 Py_XDECREF(errorHandler);
5643 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005644 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005645
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005647 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005648 Py_XDECREF(errorHandler);
5649 Py_XDECREF(exc);
5650 return NULL;
5651}
5652
5653PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005654_PyUnicode_EncodeUTF32(PyObject *str,
5655 const char *errors,
5656 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005657{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005658 enum PyUnicode_Kind kind;
5659 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005660 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005661 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005662 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005663#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005664 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005665#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005666 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005667#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005668 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005669 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005670 PyObject *errorHandler = NULL;
5671 PyObject *exc = NULL;
5672 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005673
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005674 if (!PyUnicode_Check(str)) {
5675 PyErr_BadArgument();
5676 return NULL;
5677 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005678 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005679 return NULL;
5680 kind = PyUnicode_KIND(str);
5681 data = PyUnicode_DATA(str);
5682 len = PyUnicode_GET_LENGTH(str);
5683
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005684 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005685 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005686 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005687 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005688 if (v == NULL)
5689 return NULL;
5690
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005691 /* output buffer is 4-bytes aligned */
5692 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005693 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005694 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005695 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005696 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005697 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005698
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005699 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005700 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005701 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005702 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005703 else
5704 encoding = "utf-32";
5705
5706 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005707 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5708 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005709 }
5710
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005711 pos = 0;
5712 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005713 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005714
5715 if (kind == PyUnicode_2BYTE_KIND) {
5716 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5717 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005718 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005719 else {
5720 assert(kind == PyUnicode_4BYTE_KIND);
5721 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5722 &out, native_ordering);
5723 }
5724 if (pos == len)
5725 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005726
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005727 rep = unicode_encode_call_errorhandler(
5728 errors, &errorHandler,
5729 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005730 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005731 if (!rep)
5732 goto error;
5733
5734 if (PyBytes_Check(rep)) {
5735 repsize = PyBytes_GET_SIZE(rep);
5736 if (repsize & 3) {
5737 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005738 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005739 "surrogates not allowed");
5740 goto error;
5741 }
5742 moreunits = repsize / 4;
5743 }
5744 else {
5745 assert(PyUnicode_Check(rep));
5746 if (PyUnicode_READY(rep) < 0)
5747 goto error;
5748 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5749 if (!PyUnicode_IS_ASCII(rep)) {
5750 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005751 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005752 "surrogates not allowed");
5753 goto error;
5754 }
5755 }
5756
5757 /* four bytes are reserved for each surrogate */
5758 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005759 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005760 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005761 /* integer overflow */
5762 PyErr_NoMemory();
5763 goto error;
5764 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005765 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005766 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005767 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005768 }
5769
5770 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005771 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005772 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005773 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005774 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005775 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5776 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005777 }
5778
5779 Py_CLEAR(rep);
5780 }
5781
5782 /* Cut back to size actually needed. This is necessary for, for example,
5783 encoding of a string containing isolated surrogates and the 'ignore'
5784 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005785 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005786 if (nsize != PyBytes_GET_SIZE(v))
5787 _PyBytes_Resize(&v, nsize);
5788 Py_XDECREF(errorHandler);
5789 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005790 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005791 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005792 error:
5793 Py_XDECREF(rep);
5794 Py_XDECREF(errorHandler);
5795 Py_XDECREF(exc);
5796 Py_XDECREF(v);
5797 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005798}
5799
Alexander Belopolsky40018472011-02-26 01:02:56 +00005800PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005801PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5802 Py_ssize_t size,
5803 const char *errors,
5804 int byteorder)
5805{
5806 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005807 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005808 if (tmp == NULL)
5809 return NULL;
5810 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5811 Py_DECREF(tmp);
5812 return result;
5813}
5814
5815PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005816PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005817{
Victor Stinnerb960b342011-11-20 19:12:52 +01005818 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005819}
5820
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821/* --- UTF-16 Codec ------------------------------------------------------- */
5822
Tim Peters772747b2001-08-09 22:21:55 +00005823PyObject *
5824PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 Py_ssize_t size,
5826 const char *errors,
5827 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828{
Walter Dörwald69652032004-09-07 20:24:22 +00005829 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5830}
5831
5832PyObject *
5833PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 Py_ssize_t size,
5835 const char *errors,
5836 int *byteorder,
5837 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005838{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005840 Py_ssize_t startinpos;
5841 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005842 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005843 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005844 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005845 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005846 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005847 PyObject *errorHandler = NULL;
5848 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005849 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850
Andy Lestere6be9b52020-02-11 20:28:35 -06005851 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005852 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853
5854 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005855 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005857 /* Check for BOM marks (U+FEFF) in the input and adjust current
5858 byte order setting accordingly. In native mode, the leading BOM
5859 mark is skipped, in all other modes, it is copied to the output
5860 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005861 if (bo == 0 && size >= 2) {
5862 const Py_UCS4 bom = (q[1] << 8) | q[0];
5863 if (bom == 0xFEFF) {
5864 q += 2;
5865 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005867 else if (bom == 0xFFFE) {
5868 q += 2;
5869 bo = 1;
5870 }
5871 if (byteorder)
5872 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005873 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874
Antoine Pitrou63065d72012-05-15 23:48:04 +02005875 if (q == e) {
5876 if (consumed)
5877 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005878 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005879 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005880
Christian Heimes743e0cd2012-10-17 23:52:17 +02005881#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005882 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005883 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005884#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005885 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005886 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005887#endif
Tim Peters772747b2001-08-09 22:21:55 +00005888
Antoine Pitrou63065d72012-05-15 23:48:04 +02005889 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005890 character count normally. Error handler will take care of
5891 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005892 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005893 writer.min_length = (e - q + 1) / 2;
5894 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005895 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005896
Antoine Pitrou63065d72012-05-15 23:48:04 +02005897 while (1) {
5898 Py_UCS4 ch = 0;
5899 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005900 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005901 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005902 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005903 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005904 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005905 native_ordering);
5906 else
5907 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005908 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005909 native_ordering);
5910 } else if (kind == PyUnicode_2BYTE_KIND) {
5911 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005912 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005913 native_ordering);
5914 } else {
5915 assert(kind == PyUnicode_4BYTE_KIND);
5916 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005917 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005918 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005919 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005920 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005921
Antoine Pitrou63065d72012-05-15 23:48:04 +02005922 switch (ch)
5923 {
5924 case 0:
5925 /* remaining byte at the end? (size should be even) */
5926 if (q == e || consumed)
5927 goto End;
5928 errmsg = "truncated data";
5929 startinpos = ((const char *)q) - starts;
5930 endinpos = ((const char *)e) - starts;
5931 break;
5932 /* The remaining input chars are ignored if the callback
5933 chooses to skip the input */
5934 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005935 q -= 2;
5936 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005937 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005938 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005939 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005940 endinpos = ((const char *)e) - starts;
5941 break;
5942 case 2:
5943 errmsg = "illegal encoding";
5944 startinpos = ((const char *)q) - 2 - starts;
5945 endinpos = startinpos + 2;
5946 break;
5947 case 3:
5948 errmsg = "illegal UTF-16 surrogate";
5949 startinpos = ((const char *)q) - 4 - starts;
5950 endinpos = startinpos + 2;
5951 break;
5952 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005953 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005954 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 continue;
5956 }
5957
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005958 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005959 errors,
5960 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005961 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005962 &starts,
5963 (const char **)&e,
5964 &startinpos,
5965 &endinpos,
5966 &exc,
5967 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005968 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 }
5971
Antoine Pitrou63065d72012-05-15 23:48:04 +02005972End:
Walter Dörwald69652032004-09-07 20:24:22 +00005973 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005975
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005976 Py_XDECREF(errorHandler);
5977 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005978 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005981 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005982 Py_XDECREF(errorHandler);
5983 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 return NULL;
5985}
5986
Tim Peters772747b2001-08-09 22:21:55 +00005987PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005988_PyUnicode_EncodeUTF16(PyObject *str,
5989 const char *errors,
5990 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005992 enum PyUnicode_Kind kind;
5993 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005994 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005995 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005996 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005997 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005998#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005999 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006000#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006001 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006002#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006003 const char *encoding;
6004 Py_ssize_t nsize, pos;
6005 PyObject *errorHandler = NULL;
6006 PyObject *exc = NULL;
6007 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006008
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006009 if (!PyUnicode_Check(str)) {
6010 PyErr_BadArgument();
6011 return NULL;
6012 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006013 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006014 return NULL;
6015 kind = PyUnicode_KIND(str);
6016 data = PyUnicode_DATA(str);
6017 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006018
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006019 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006020 if (kind == PyUnicode_4BYTE_KIND) {
6021 const Py_UCS4 *in = (const Py_UCS4 *)data;
6022 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006023 while (in < end) {
6024 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006025 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006026 }
6027 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006028 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006029 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006031 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006032 nsize = len + pairs + (byteorder == 0);
6033 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006034 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006036 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006038 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006039 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006040 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006041 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006042 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006043 }
6044 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006045 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006046 }
Tim Peters772747b2001-08-09 22:21:55 +00006047
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006048 if (kind == PyUnicode_1BYTE_KIND) {
6049 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6050 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006051 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006052
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006053 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006054 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006055 }
6056 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006057 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006058 }
6059 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006060 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006061 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006062
6063 pos = 0;
6064 while (pos < len) {
6065 Py_ssize_t repsize, moreunits;
6066
6067 if (kind == PyUnicode_2BYTE_KIND) {
6068 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6069 &out, native_ordering);
6070 }
6071 else {
6072 assert(kind == PyUnicode_4BYTE_KIND);
6073 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6074 &out, native_ordering);
6075 }
6076 if (pos == len)
6077 break;
6078
6079 rep = unicode_encode_call_errorhandler(
6080 errors, &errorHandler,
6081 encoding, "surrogates not allowed",
6082 str, &exc, pos, pos + 1, &pos);
6083 if (!rep)
6084 goto error;
6085
6086 if (PyBytes_Check(rep)) {
6087 repsize = PyBytes_GET_SIZE(rep);
6088 if (repsize & 1) {
6089 raise_encode_exception(&exc, encoding,
6090 str, pos - 1, pos,
6091 "surrogates not allowed");
6092 goto error;
6093 }
6094 moreunits = repsize / 2;
6095 }
6096 else {
6097 assert(PyUnicode_Check(rep));
6098 if (PyUnicode_READY(rep) < 0)
6099 goto error;
6100 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6101 if (!PyUnicode_IS_ASCII(rep)) {
6102 raise_encode_exception(&exc, encoding,
6103 str, pos - 1, pos,
6104 "surrogates not allowed");
6105 goto error;
6106 }
6107 }
6108
6109 /* two bytes are reserved for each surrogate */
6110 if (moreunits > 1) {
6111 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006112 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006113 /* integer overflow */
6114 PyErr_NoMemory();
6115 goto error;
6116 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006117 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006118 goto error;
6119 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6120 }
6121
6122 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006123 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006124 out += moreunits;
6125 } else /* rep is unicode */ {
6126 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6127 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6128 &out, native_ordering);
6129 }
6130
6131 Py_CLEAR(rep);
6132 }
6133
6134 /* Cut back to size actually needed. This is necessary for, for example,
6135 encoding of a string containing isolated surrogates and the 'ignore' handler
6136 is used. */
6137 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6138 if (nsize != PyBytes_GET_SIZE(v))
6139 _PyBytes_Resize(&v, nsize);
6140 Py_XDECREF(errorHandler);
6141 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006142 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006143 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006144 error:
6145 Py_XDECREF(rep);
6146 Py_XDECREF(errorHandler);
6147 Py_XDECREF(exc);
6148 Py_XDECREF(v);
6149 return NULL;
6150#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151}
6152
Alexander Belopolsky40018472011-02-26 01:02:56 +00006153PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6155 Py_ssize_t size,
6156 const char *errors,
6157 int byteorder)
6158{
6159 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006160 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006161 if (tmp == NULL)
6162 return NULL;
6163 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6164 Py_DECREF(tmp);
6165 return result;
6166}
6167
6168PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006169PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006171 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172}
6173
6174/* --- Unicode Escape Codec ----------------------------------------------- */
6175
Fredrik Lundh06d12682001-01-24 07:59:11 +00006176static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006177
Alexander Belopolsky40018472011-02-26 01:02:56 +00006178PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006179_PyUnicode_DecodeUnicodeEscape(const char *s,
6180 Py_ssize_t size,
6181 const char *errors,
6182 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006184 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006185 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006187 PyObject *errorHandler = NULL;
6188 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006189
Eric V. Smith42454af2016-10-31 09:22:08 -04006190 // so we can remember if we've seen an invalid escape char or not
6191 *first_invalid_escape = NULL;
6192
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006194 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006195 }
6196 /* Escaped strings will always be longer than the resulting
6197 Unicode string, so we start with size here and then reduce the
6198 length after conversion to the true value.
6199 (but if the error callback returns a long replacement string
6200 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006201 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006202 writer.min_length = size;
6203 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6204 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006205 }
6206
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 end = s + size;
6208 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006209 unsigned char c = (unsigned char) *s++;
6210 Py_UCS4 ch;
6211 int count;
6212 Py_ssize_t startinpos;
6213 Py_ssize_t endinpos;
6214 const char *message;
6215
6216#define WRITE_ASCII_CHAR(ch) \
6217 do { \
6218 assert(ch <= 127); \
6219 assert(writer.pos < writer.size); \
6220 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6221 } while(0)
6222
6223#define WRITE_CHAR(ch) \
6224 do { \
6225 if (ch <= writer.maxchar) { \
6226 assert(writer.pos < writer.size); \
6227 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6228 } \
6229 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6230 goto onError; \
6231 } \
6232 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233
6234 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 if (c != '\\') {
6236 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 continue;
6238 }
6239
Victor Stinner62ec3312016-09-06 17:04:34 -07006240 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 if (s >= end) {
6243 message = "\\ at end of string";
6244 goto error;
6245 }
6246 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006247
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006249 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006252 case '\n': continue;
6253 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6254 case '\'': WRITE_ASCII_CHAR('\''); continue;
6255 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6256 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006257 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006258 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6259 case 't': WRITE_ASCII_CHAR('\t'); continue;
6260 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6261 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006262 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006263 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006264 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268 case '0': case '1': case '2': case '3':
6269 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006270 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006271 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006272 ch = (ch<<3) + *s++ - '0';
6273 if (s < end && '0' <= *s && *s <= '7') {
6274 ch = (ch<<3) + *s++ - '0';
6275 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006277 WRITE_CHAR(ch);
6278 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 /* hex escapes */
6281 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006283 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006284 message = "truncated \\xXX escape";
6285 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006289 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006290 message = "truncated \\uXXXX escape";
6291 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006294 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006295 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006296 message = "truncated \\UXXXXXXXX escape";
6297 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006299 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006300 ch <<= 4;
6301 if (c >= '0' && c <= '9') {
6302 ch += c - '0';
6303 }
6304 else if (c >= 'a' && c <= 'f') {
6305 ch += c - ('a' - 10);
6306 }
6307 else if (c >= 'A' && c <= 'F') {
6308 ch += c - ('A' - 10);
6309 }
6310 else {
6311 break;
6312 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006313 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006315 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006316 }
6317
6318 /* when we get here, ch is a 32-bit unicode character */
6319 if (ch > MAX_UNICODE) {
6320 message = "illegal Unicode character";
6321 goto error;
6322 }
6323
6324 WRITE_CHAR(ch);
6325 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006326
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006328 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006329 if (ucnhash_CAPI == NULL) {
6330 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006331 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6332 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006333 if (ucnhash_CAPI == NULL) {
6334 PyErr_SetString(
6335 PyExc_UnicodeError,
6336 "\\N escapes not supported (can't load unicodedata module)"
6337 );
6338 goto onError;
6339 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006340 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006341
6342 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006343 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006344 const char *start = ++s;
6345 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006346 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006347 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006348 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006349 namelen = s - start;
6350 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006351 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006352 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006353 ch = 0xffffffff; /* in case 'getcode' messes up */
6354 if (namelen <= INT_MAX &&
6355 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6356 &ch, 0)) {
6357 assert(ch <= MAX_UNICODE);
6358 WRITE_CHAR(ch);
6359 continue;
6360 }
6361 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006362 }
6363 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006364 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006365
6366 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006367 if (*first_invalid_escape == NULL) {
6368 *first_invalid_escape = s-1; /* Back up one char, since we've
6369 already incremented s. */
6370 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006371 WRITE_ASCII_CHAR('\\');
6372 WRITE_CHAR(c);
6373 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006375
6376 error:
6377 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006378 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006379 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006380 errors, &errorHandler,
6381 "unicodeescape", message,
6382 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006383 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006384 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006385 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006386 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006387
6388#undef WRITE_ASCII_CHAR
6389#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006391
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006392 Py_XDECREF(errorHandler);
6393 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006394 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006395
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006397 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398 Py_XDECREF(errorHandler);
6399 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 return NULL;
6401}
6402
Eric V. Smith42454af2016-10-31 09:22:08 -04006403PyObject *
6404PyUnicode_DecodeUnicodeEscape(const char *s,
6405 Py_ssize_t size,
6406 const char *errors)
6407{
6408 const char *first_invalid_escape;
6409 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6410 &first_invalid_escape);
6411 if (result == NULL)
6412 return NULL;
6413 if (first_invalid_escape != NULL) {
6414 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6415 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006416 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006417 Py_DECREF(result);
6418 return NULL;
6419 }
6420 }
6421 return result;
6422}
6423
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006424/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425
Alexander Belopolsky40018472011-02-26 01:02:56 +00006426PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006427PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006429 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006430 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006432 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006433 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006434 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435
Ezio Melottie7f90372012-10-05 03:33:31 +03006436 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006437 escape.
6438
Ezio Melottie7f90372012-10-05 03:33:31 +03006439 For UCS1 strings it's '\xxx', 4 bytes per source character.
6440 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6441 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006442 */
6443
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006444 if (!PyUnicode_Check(unicode)) {
6445 PyErr_BadArgument();
6446 return NULL;
6447 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006448 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006449 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006450 }
Victor Stinner358af132015-10-12 22:36:57 +02006451
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006452 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006453 if (len == 0) {
6454 return PyBytes_FromStringAndSize(NULL, 0);
6455 }
6456
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006457 kind = PyUnicode_KIND(unicode);
6458 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006459 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6460 bytes, and 1 byte characters 4. */
6461 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006462 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006463 return PyErr_NoMemory();
6464 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006465 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006466 if (repr == NULL) {
6467 return NULL;
6468 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006469
Victor Stinner62ec3312016-09-06 17:04:34 -07006470 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006471 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006472 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006473
Victor Stinner62ec3312016-09-06 17:04:34 -07006474 /* U+0000-U+00ff range */
6475 if (ch < 0x100) {
6476 if (ch >= ' ' && ch < 127) {
6477 if (ch != '\\') {
6478 /* Copy printable US ASCII as-is */
6479 *p++ = (char) ch;
6480 }
6481 /* Escape backslashes */
6482 else {
6483 *p++ = '\\';
6484 *p++ = '\\';
6485 }
6486 }
Victor Stinner358af132015-10-12 22:36:57 +02006487
Victor Stinner62ec3312016-09-06 17:04:34 -07006488 /* Map special whitespace to '\t', \n', '\r' */
6489 else if (ch == '\t') {
6490 *p++ = '\\';
6491 *p++ = 't';
6492 }
6493 else if (ch == '\n') {
6494 *p++ = '\\';
6495 *p++ = 'n';
6496 }
6497 else if (ch == '\r') {
6498 *p++ = '\\';
6499 *p++ = 'r';
6500 }
6501
6502 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6503 else {
6504 *p++ = '\\';
6505 *p++ = 'x';
6506 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6507 *p++ = Py_hexdigits[ch & 0x000F];
6508 }
Tim Petersced69f82003-09-16 20:30:58 +00006509 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006510 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006511 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 *p++ = '\\';
6513 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006514 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6515 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6516 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6517 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006519 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6520 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006521
Victor Stinner62ec3312016-09-06 17:04:34 -07006522 /* Make sure that the first two digits are zero */
6523 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006524 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006525 *p++ = 'U';
6526 *p++ = '0';
6527 *p++ = '0';
6528 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6529 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6530 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6531 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6532 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6533 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006534 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536
Victor Stinner62ec3312016-09-06 17:04:34 -07006537 assert(p - PyBytes_AS_STRING(repr) > 0);
6538 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6539 return NULL;
6540 }
6541 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542}
6543
Alexander Belopolsky40018472011-02-26 01:02:56 +00006544PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006545PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6546 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006548 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006549 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006550 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006552 }
6553
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006554 result = PyUnicode_AsUnicodeEscapeString(tmp);
6555 Py_DECREF(tmp);
6556 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557}
6558
6559/* --- Raw Unicode Escape Codec ------------------------------------------- */
6560
Alexander Belopolsky40018472011-02-26 01:02:56 +00006561PyObject *
6562PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006563 Py_ssize_t size,
6564 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006566 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006567 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006569 PyObject *errorHandler = NULL;
6570 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006571
Victor Stinner62ec3312016-09-06 17:04:34 -07006572 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006573 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006574 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006575
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 /* Escaped strings will always be longer than the resulting
6577 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006578 length after conversion to the true value. (But decoding error
6579 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006580 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006581 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006582 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6583 goto onError;
6584 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006585
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 end = s + size;
6587 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006588 unsigned char c = (unsigned char) *s++;
6589 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006590 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006591 Py_ssize_t startinpos;
6592 Py_ssize_t endinpos;
6593 const char *message;
6594
6595#define WRITE_CHAR(ch) \
6596 do { \
6597 if (ch <= writer.maxchar) { \
6598 assert(writer.pos < writer.size); \
6599 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6600 } \
6601 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6602 goto onError; \
6603 } \
6604 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006607 if (c != '\\' || s >= end) {
6608 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006610 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006611
Victor Stinner62ec3312016-09-06 17:04:34 -07006612 c = (unsigned char) *s++;
6613 if (c == 'u') {
6614 count = 4;
6615 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006617 else if (c == 'U') {
6618 count = 8;
6619 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006620 }
6621 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006622 assert(writer.pos < writer.size);
6623 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6624 WRITE_CHAR(c);
6625 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006626 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006627 startinpos = s - starts - 2;
6628
6629 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6630 for (ch = 0; count && s < end; ++s, --count) {
6631 c = (unsigned char)*s;
6632 ch <<= 4;
6633 if (c >= '0' && c <= '9') {
6634 ch += c - '0';
6635 }
6636 else if (c >= 'a' && c <= 'f') {
6637 ch += c - ('a' - 10);
6638 }
6639 else if (c >= 'A' && c <= 'F') {
6640 ch += c - ('A' - 10);
6641 }
6642 else {
6643 break;
6644 }
6645 }
6646 if (!count) {
6647 if (ch <= MAX_UNICODE) {
6648 WRITE_CHAR(ch);
6649 continue;
6650 }
6651 message = "\\Uxxxxxxxx out of range";
6652 }
6653
6654 endinpos = s-starts;
6655 writer.min_length = end - s + writer.pos;
6656 if (unicode_decode_call_errorhandler_writer(
6657 errors, &errorHandler,
6658 "rawunicodeescape", message,
6659 &starts, &end, &startinpos, &endinpos, &exc, &s,
6660 &writer)) {
6661 goto onError;
6662 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006663 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006664
6665#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667 Py_XDECREF(errorHandler);
6668 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006669 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006670
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006672 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006673 Py_XDECREF(errorHandler);
6674 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006676
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677}
6678
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006679
Alexander Belopolsky40018472011-02-26 01:02:56 +00006680PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006681PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682{
Victor Stinner62ec3312016-09-06 17:04:34 -07006683 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006685 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006686 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006687 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006688 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006690 if (!PyUnicode_Check(unicode)) {
6691 PyErr_BadArgument();
6692 return NULL;
6693 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006694 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006695 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006696 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006697 kind = PyUnicode_KIND(unicode);
6698 data = PyUnicode_DATA(unicode);
6699 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006700 if (kind == PyUnicode_1BYTE_KIND) {
6701 return PyBytes_FromStringAndSize(data, len);
6702 }
Victor Stinner0e368262011-11-10 20:12:49 +01006703
Victor Stinner62ec3312016-09-06 17:04:34 -07006704 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6705 bytes, and 1 byte characters 4. */
6706 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006707
Victor Stinner62ec3312016-09-06 17:04:34 -07006708 if (len > PY_SSIZE_T_MAX / expandsize) {
6709 return PyErr_NoMemory();
6710 }
6711 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6712 if (repr == NULL) {
6713 return NULL;
6714 }
6715 if (len == 0) {
6716 return repr;
6717 }
6718
6719 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006720 for (pos = 0; pos < len; pos++) {
6721 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006722
Victor Stinner62ec3312016-09-06 17:04:34 -07006723 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6724 if (ch < 0x100) {
6725 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006726 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006727 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006728 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 *p++ = '\\';
6730 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006731 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6732 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6733 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6734 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006736 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6737 else {
6738 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6739 *p++ = '\\';
6740 *p++ = 'U';
6741 *p++ = '0';
6742 *p++ = '0';
6743 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6744 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6745 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6746 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6747 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6748 *p++ = Py_hexdigits[ch & 15];
6749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006751
Victor Stinner62ec3312016-09-06 17:04:34 -07006752 assert(p > PyBytes_AS_STRING(repr));
6753 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6754 return NULL;
6755 }
6756 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757}
6758
Alexander Belopolsky40018472011-02-26 01:02:56 +00006759PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006760PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6761 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006763 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006764 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006765 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006766 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006767 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6768 Py_DECREF(tmp);
6769 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770}
6771
6772/* --- Latin-1 Codec ------------------------------------------------------ */
6773
Alexander Belopolsky40018472011-02-26 01:02:56 +00006774PyObject *
6775PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006776 Py_ssize_t size,
6777 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006780 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781}
6782
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006783/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006784static void
6785make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006786 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006787 PyObject *unicode,
6788 Py_ssize_t startpos, Py_ssize_t endpos,
6789 const char *reason)
6790{
6791 if (*exceptionObject == NULL) {
6792 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006793 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006794 encoding, unicode, startpos, endpos, reason);
6795 }
6796 else {
6797 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6798 goto onError;
6799 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6800 goto onError;
6801 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6802 goto onError;
6803 return;
6804 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006805 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006806 }
6807}
6808
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006809/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006810static void
6811raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006812 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006813 PyObject *unicode,
6814 Py_ssize_t startpos, Py_ssize_t endpos,
6815 const char *reason)
6816{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006817 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006818 encoding, unicode, startpos, endpos, reason);
6819 if (*exceptionObject != NULL)
6820 PyCodec_StrictErrors(*exceptionObject);
6821}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822
6823/* error handling callback helper:
6824 build arguments, call the callback and check the arguments,
6825 put the result into newpos and return the replacement string, which
6826 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006827static PyObject *
6828unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006829 PyObject **errorHandler,
6830 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006831 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006832 Py_ssize_t startpos, Py_ssize_t endpos,
6833 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006834{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006835 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006836 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006837 PyObject *restuple;
6838 PyObject *resunicode;
6839
6840 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006842 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844 }
6845
Benjamin Petersonbac79492012-01-14 13:34:47 -05006846 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006847 return NULL;
6848 len = PyUnicode_GET_LENGTH(unicode);
6849
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006850 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006851 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006852 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006854
Petr Viktorinffd97532020-02-11 17:46:57 +01006855 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006856 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006858 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006859 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 Py_DECREF(restuple);
6861 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006862 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006863 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006864 &resunicode, newpos)) {
6865 Py_DECREF(restuple);
6866 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006867 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006868 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6869 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6870 Py_DECREF(restuple);
6871 return NULL;
6872 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006873 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006874 *newpos = len + *newpos;
6875 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006876 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 Py_DECREF(restuple);
6878 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006879 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006880 Py_INCREF(resunicode);
6881 Py_DECREF(restuple);
6882 return resunicode;
6883}
6884
Alexander Belopolsky40018472011-02-26 01:02:56 +00006885static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006886unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006887 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006888 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006889{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006890 /* input state */
6891 Py_ssize_t pos=0, size;
6892 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006893 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006894 /* pointer into the output */
6895 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006896 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6897 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006898 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006900 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006901 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006902 /* output object */
6903 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006904
Benjamin Petersonbac79492012-01-14 13:34:47 -05006905 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006906 return NULL;
6907 size = PyUnicode_GET_LENGTH(unicode);
6908 kind = PyUnicode_KIND(unicode);
6909 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006910 /* allocate enough for a simple encoding without
6911 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006912 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006913 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006914
6915 _PyBytesWriter_Init(&writer);
6916 str = _PyBytesWriter_Alloc(&writer, size);
6917 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006918 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006920 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006921 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006922
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006924 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006926 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006927 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006928 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006930 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006931 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006932 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006933 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006935
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006936 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006938
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006939 /* Only overallocate the buffer if it's not the last write */
6940 writer.overallocate = (collend < size);
6941
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006943 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006944 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006945
6946 switch (error_handler) {
6947 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006948 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006950
6951 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006952 memset(str, '?', collend - collstart);
6953 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006954 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006955 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006956 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 break;
Victor Stinner50149202015-09-22 00:26:54 +02006958
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006959 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006960 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006961 writer.min_size -= (collend - collstart);
6962 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006963 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006964 if (str == NULL)
6965 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006966 pos = collend;
6967 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006968
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006969 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006970 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006971 writer.min_size -= (collend - collstart);
6972 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006973 unicode, collstart, collend);
6974 if (str == NULL)
6975 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006976 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 break;
Victor Stinner50149202015-09-22 00:26:54 +02006978
Victor Stinnerc3713e92015-09-29 12:32:13 +02006979 case _Py_ERROR_SURROGATEESCAPE:
6980 for (i = collstart; i < collend; ++i) {
6981 ch = PyUnicode_READ(kind, data, i);
6982 if (ch < 0xdc80 || 0xdcff < ch) {
6983 /* Not a UTF-8b surrogate */
6984 break;
6985 }
6986 *str++ = (char)(ch - 0xdc00);
6987 ++pos;
6988 }
6989 if (i >= collend)
6990 break;
6991 collstart = pos;
6992 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006993 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006994
Benjamin Peterson29060642009-01-31 22:14:21 +00006995 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006996 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6997 encoding, reason, unicode, &exc,
6998 collstart, collend, &newpos);
6999 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007001
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007002 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007003 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007004
Victor Stinner6bd525b2015-10-09 13:10:05 +02007005 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007006 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007007 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007008 PyBytes_AS_STRING(rep),
7009 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007010 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007011 else {
7012 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007013
Victor Stinner6bd525b2015-10-09 13:10:05 +02007014 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007016
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007017 if (limit == 256 ?
7018 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7019 !PyUnicode_IS_ASCII(rep))
7020 {
7021 /* Not all characters are smaller than limit */
7022 raise_encode_exception(&exc, encoding, unicode,
7023 collstart, collend, reason);
7024 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007026 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7027 str = _PyBytesWriter_WriteBytes(&writer, str,
7028 PyUnicode_DATA(rep),
7029 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007031 if (str == NULL)
7032 goto onError;
7033
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007034 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007035 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007036 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007037
7038 /* If overallocation was disabled, ensure that it was the last
7039 write. Otherwise, we missed an optimization */
7040 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007041 }
7042 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007043
Victor Stinner50149202015-09-22 00:26:54 +02007044 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007045 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007046 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007047
7048 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007049 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007050 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007051 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007052 Py_XDECREF(exc);
7053 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007054}
7055
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007056/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007057PyObject *
7058PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007059 Py_ssize_t size,
7060 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007062 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007063 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007064 if (unicode == NULL)
7065 return NULL;
7066 result = unicode_encode_ucs1(unicode, errors, 256);
7067 Py_DECREF(unicode);
7068 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069}
7070
Alexander Belopolsky40018472011-02-26 01:02:56 +00007071PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007072_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073{
7074 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 PyErr_BadArgument();
7076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007078 if (PyUnicode_READY(unicode) == -1)
7079 return NULL;
7080 /* Fast path: if it is a one-byte string, construct
7081 bytes object directly. */
7082 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7083 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7084 PyUnicode_GET_LENGTH(unicode));
7085 /* Non-Latin-1 characters present. Defer to above function to
7086 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007087 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007088}
7089
7090PyObject*
7091PyUnicode_AsLatin1String(PyObject *unicode)
7092{
7093 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094}
7095
7096/* --- 7-bit ASCII Codec -------------------------------------------------- */
7097
Alexander Belopolsky40018472011-02-26 01:02:56 +00007098PyObject *
7099PyUnicode_DecodeASCII(const char *s,
7100 Py_ssize_t size,
7101 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007103 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007104 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007105 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007106 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007107 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007108
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007110 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007111
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007113 if (size == 1 && (unsigned char)s[0] < 128)
7114 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007115
Inada Naoki770847a2019-06-24 12:30:24 +09007116 // Shortcut for simple case
7117 PyObject *u = PyUnicode_New(size, 127);
7118 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007119 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007120 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007121 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007122 if (outpos == size) {
7123 return u;
7124 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007125
Inada Naoki770847a2019-06-24 12:30:24 +09007126 _PyUnicodeWriter writer;
7127 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007128 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007129
Inada Naoki770847a2019-06-24 12:30:24 +09007130 s += outpos;
7131 int kind = writer.kind;
7132 void *data = writer.data;
7133 Py_ssize_t startinpos, endinpos;
7134
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007135 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007136 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007138 PyUnicode_WRITE(kind, data, writer.pos, c);
7139 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007140 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007141 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007143
7144 /* byte outsize range 0x00..0x7f: call the error handler */
7145
7146 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007147 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007148
7149 switch (error_handler)
7150 {
7151 case _Py_ERROR_REPLACE:
7152 case _Py_ERROR_SURROGATEESCAPE:
7153 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007154 but we may switch to UCS2 at the first write */
7155 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7156 goto onError;
7157 kind = writer.kind;
7158 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007159
7160 if (error_handler == _Py_ERROR_REPLACE)
7161 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7162 else
7163 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7164 writer.pos++;
7165 ++s;
7166 break;
7167
7168 case _Py_ERROR_IGNORE:
7169 ++s;
7170 break;
7171
7172 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007173 startinpos = s-starts;
7174 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007175 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007176 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 "ascii", "ordinal not in range(128)",
7178 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007179 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007180 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007181 kind = writer.kind;
7182 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007185 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007186 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007187 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007188
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007190 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007191 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007192 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 return NULL;
7194}
7195
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007196/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007197PyObject *
7198PyUnicode_EncodeASCII(const Py_UNICODE *p,
7199 Py_ssize_t size,
7200 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007202 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007203 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007204 if (unicode == NULL)
7205 return NULL;
7206 result = unicode_encode_ucs1(unicode, errors, 128);
7207 Py_DECREF(unicode);
7208 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209}
7210
Alexander Belopolsky40018472011-02-26 01:02:56 +00007211PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007212_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213{
7214 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007215 PyErr_BadArgument();
7216 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007218 if (PyUnicode_READY(unicode) == -1)
7219 return NULL;
7220 /* Fast path: if it is an ASCII-only string, construct bytes object
7221 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007222 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007223 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7224 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007225 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007226}
7227
7228PyObject *
7229PyUnicode_AsASCIIString(PyObject *unicode)
7230{
7231 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232}
7233
Steve Dowercc16be82016-09-08 10:35:16 -07007234#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007235
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007236/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007237
/* NEED_RETRY is defined when int is narrower than size_t.  The stateful
   code page decoder below tests it to split inputs larger than
   DECODING_CHUNK_SIZE into several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Fallback definition for headers that do not provide the constant. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7251
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007252static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007253code_page_name(UINT code_page, PyObject **obj)
7254{
7255 *obj = NULL;
7256 if (code_page == CP_ACP)
7257 return "mbcs";
7258 if (code_page == CP_UTF7)
7259 return "CP_UTF7";
7260 if (code_page == CP_UTF8)
7261 return "CP_UTF8";
7262
7263 *obj = PyBytes_FromFormat("cp%u", code_page);
7264 if (*obj == NULL)
7265 return NULL;
7266 return PyBytes_AS_STRING(*obj);
7267}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007268
Victor Stinner3a50e702011-10-18 21:21:00 +02007269static DWORD
7270decode_code_page_flags(UINT code_page)
7271{
7272 if (code_page == CP_UTF7) {
7273 /* The CP_UTF7 decoder only supports flags=0 */
7274 return 0;
7275 }
7276 else
7277 return MB_ERR_INVALID_CHARS;
7278}
7279
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007280/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007281 * Decode a byte string from a Windows code page into unicode object in strict
7282 * mode.
7283 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007284 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7285 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007286 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007287static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007288decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007289 wchar_t **buf,
7290 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 const char *in,
7292 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007293{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007294 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007295 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007297
7298 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007300 while ((outsize = MultiByteToWideChar(code_page, flags,
7301 in, insize, NULL, 0)) <= 0)
7302 {
7303 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7304 goto error;
7305 }
7306 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7307 flags = 0;
7308 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007309
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007310 /* Extend a wchar_t* buffer */
7311 Py_ssize_t n = *bufsize; /* Get the current length */
7312 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7313 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007315 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007316
7317 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007318 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7319 if (outsize <= 0)
7320 goto error;
7321 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007322
Victor Stinner3a50e702011-10-18 21:21:00 +02007323error:
7324 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7325 return -2;
7326 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007327 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007328}
7329
Victor Stinner3a50e702011-10-18 21:21:00 +02007330/*
7331 * Decode a byte string from a code page into unicode object with an error
7332 * handler.
7333 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007334 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007335 * UnicodeDecodeError exception and returns -1 on error.
7336 */
7337static int
7338decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007339 wchar_t **buf,
7340 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007341 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007342 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007343{
7344 const char *startin = in;
7345 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007346 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007347 /* Ideally, we should get reason from FormatMessage. This is the Windows
7348 2000 English version of the message. */
7349 const char *reason = "No mapping for the Unicode character exists "
7350 "in the target code page.";
7351 /* each step cannot decode more than 1 character, but a character can be
7352 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007353 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007354 int insize;
7355 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007356 PyObject *errorHandler = NULL;
7357 PyObject *exc = NULL;
7358 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007359 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007360 DWORD err;
7361 int ret = -1;
7362
7363 assert(size > 0);
7364
7365 encoding = code_page_name(code_page, &encoding_obj);
7366 if (encoding == NULL)
7367 return -1;
7368
Victor Stinner7d00cc12014-03-17 23:08:06 +01007369 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007370 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7371 UnicodeDecodeError. */
7372 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7373 if (exc != NULL) {
7374 PyCodec_StrictErrors(exc);
7375 Py_CLEAR(exc);
7376 }
7377 goto error;
7378 }
7379
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007380 /* Extend a wchar_t* buffer */
7381 Py_ssize_t n = *bufsize; /* Get the current length */
7382 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7383 PyErr_NoMemory();
7384 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007386 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7387 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007389 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007390
7391 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 while (in < endin)
7393 {
7394 /* Decode a character */
7395 insize = 1;
7396 do
7397 {
7398 outsize = MultiByteToWideChar(code_page, flags,
7399 in, insize,
7400 buffer, Py_ARRAY_LENGTH(buffer));
7401 if (outsize > 0)
7402 break;
7403 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007404 if (err == ERROR_INVALID_FLAGS && flags) {
7405 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7406 flags = 0;
7407 continue;
7408 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007409 if (err != ERROR_NO_UNICODE_TRANSLATION
7410 && err != ERROR_INSUFFICIENT_BUFFER)
7411 {
7412 PyErr_SetFromWindowsErr(0);
7413 goto error;
7414 }
7415 insize++;
7416 }
7417 /* 4=maximum length of a UTF-8 sequence */
7418 while (insize <= 4 && (in + insize) <= endin);
7419
7420 if (outsize <= 0) {
7421 Py_ssize_t startinpos, endinpos, outpos;
7422
Victor Stinner7d00cc12014-03-17 23:08:06 +01007423 /* last character in partial decode? */
7424 if (in + insize >= endin && !final)
7425 break;
7426
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 startinpos = in - startin;
7428 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007429 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007430 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 errors, &errorHandler,
7432 encoding, reason,
7433 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007434 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 {
7436 goto error;
7437 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007438 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 }
7440 else {
7441 in += insize;
7442 memcpy(out, buffer, outsize * sizeof(wchar_t));
7443 out += outsize;
7444 }
7445 }
7446
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007447 /* Shrink the buffer */
7448 assert(out - *buf <= *bufsize);
7449 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007450 /* (in - startin) <= size and size is an int */
7451 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007452
7453error:
7454 Py_XDECREF(encoding_obj);
7455 Py_XDECREF(errorHandler);
7456 Py_XDECREF(exc);
7457 return ret;
7458}
7459
Victor Stinner3a50e702011-10-18 21:21:00 +02007460static PyObject *
7461decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007462 const char *s, Py_ssize_t size,
7463 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007464{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007465 wchar_t *buf = NULL;
7466 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007467 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007468
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 if (code_page < 0) {
7470 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7471 return NULL;
7472 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007473 if (size < 0) {
7474 PyErr_BadInternalCall();
7475 return NULL;
7476 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007477
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007478 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007480
Victor Stinner76a31a62011-11-04 00:05:13 +01007481 do
7482 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007483#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007484 if (size > DECODING_CHUNK_SIZE) {
7485 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007486 final = 0;
7487 done = 0;
7488 }
7489 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007490#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007491 {
7492 chunk_size = (int)size;
7493 final = (consumed == NULL);
7494 done = 1;
7495 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007496
Victor Stinner76a31a62011-11-04 00:05:13 +01007497 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007498 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007499 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007500 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007501 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007502
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007503 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007504 s, chunk_size);
7505 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007506 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007507 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007508 errors, final);
7509 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007510
7511 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007512 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007513 return NULL;
7514 }
7515
7516 if (consumed)
7517 *consumed += converted;
7518
7519 s += converted;
7520 size -= converted;
7521 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007522
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007523 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7524 PyMem_Free(buf);
7525 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007526}
7527
Alexander Belopolsky40018472011-02-26 01:02:56 +00007528PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007529PyUnicode_DecodeCodePageStateful(int code_page,
7530 const char *s,
7531 Py_ssize_t size,
7532 const char *errors,
7533 Py_ssize_t *consumed)
7534{
7535 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7536}
7537
7538PyObject *
7539PyUnicode_DecodeMBCSStateful(const char *s,
7540 Py_ssize_t size,
7541 const char *errors,
7542 Py_ssize_t *consumed)
7543{
7544 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7545}
7546
7547PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007548PyUnicode_DecodeMBCS(const char *s,
7549 Py_ssize_t size,
7550 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007551{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007552 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7553}
7554
Victor Stinner3a50e702011-10-18 21:21:00 +02007555static DWORD
7556encode_code_page_flags(UINT code_page, const char *errors)
7557{
7558 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007559 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007560 }
7561 else if (code_page == CP_UTF7) {
7562 /* CP_UTF7 only supports flags=0 */
7563 return 0;
7564 }
7565 else {
7566 if (errors != NULL && strcmp(errors, "replace") == 0)
7567 return 0;
7568 else
7569 return WC_NO_BEST_FIT_CHARS;
7570 }
7571}
7572
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007573/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007574 * Encode a Unicode string to a Windows code page into a byte string in strict
7575 * mode.
7576 *
7577 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007578 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007579 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007580static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007581encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007582 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007583 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007584{
Victor Stinner554f3f02010-06-16 23:33:54 +00007585 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 BOOL *pusedDefaultChar = &usedDefaultChar;
7587 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007588 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007589 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007590 const DWORD flags = encode_code_page_flags(code_page, NULL);
7591 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007592 /* Create a substring so that we can get the UTF-16 representation
7593 of just the slice under consideration. */
7594 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007595
Martin v. Löwis3d325192011-11-04 18:23:06 +01007596 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007597
Victor Stinner3a50e702011-10-18 21:21:00 +02007598 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007599 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007601 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007602
Victor Stinner2fc507f2011-11-04 20:06:39 +01007603 substring = PyUnicode_Substring(unicode, offset, offset+len);
7604 if (substring == NULL)
7605 return -1;
7606 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7607 if (p == NULL) {
7608 Py_DECREF(substring);
7609 return -1;
7610 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007611 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007612
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007613 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007614 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007615 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007616 NULL, 0,
7617 NULL, pusedDefaultChar);
7618 if (outsize <= 0)
7619 goto error;
7620 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007621 if (pusedDefaultChar && *pusedDefaultChar) {
7622 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007623 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007624 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007625
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007629 if (*outbytes == NULL) {
7630 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007632 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007633 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007634 }
7635 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007637 const Py_ssize_t n = PyBytes_Size(*outbytes);
7638 if (outsize > PY_SSIZE_T_MAX - n) {
7639 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007640 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007642 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007643 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7644 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007645 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007646 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007648 }
7649
7650 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007651 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007652 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007653 out, outsize,
7654 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007655 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007656 if (outsize <= 0)
7657 goto error;
7658 if (pusedDefaultChar && *pusedDefaultChar)
7659 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007660 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007661
Victor Stinner3a50e702011-10-18 21:21:00 +02007662error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007663 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007664 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7665 return -2;
7666 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007667 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007668}
7669
Victor Stinner3a50e702011-10-18 21:21:00 +02007670/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007671 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007672 * error handler.
7673 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007674 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007675 * -1 on other error.
7676 */
7677static int
7678encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007679 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007680 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007681{
Victor Stinner3a50e702011-10-18 21:21:00 +02007682 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007683 Py_ssize_t pos = unicode_offset;
7684 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007685 /* Ideally, we should get reason from FormatMessage. This is the Windows
7686 2000 English version of the message. */
7687 const char *reason = "invalid character";
7688 /* 4=maximum length of a UTF-8 sequence */
7689 char buffer[4];
7690 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7691 Py_ssize_t outsize;
7692 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007693 PyObject *errorHandler = NULL;
7694 PyObject *exc = NULL;
7695 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007696 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007697 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007698 PyObject *rep;
7699 int ret = -1;
7700
7701 assert(insize > 0);
7702
7703 encoding = code_page_name(code_page, &encoding_obj);
7704 if (encoding == NULL)
7705 return -1;
7706
7707 if (errors == NULL || strcmp(errors, "strict") == 0) {
7708 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7709 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007710 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007711 if (exc != NULL) {
7712 PyCodec_StrictErrors(exc);
7713 Py_DECREF(exc);
7714 }
7715 Py_XDECREF(encoding_obj);
7716 return -1;
7717 }
7718
7719 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7720 pusedDefaultChar = &usedDefaultChar;
7721 else
7722 pusedDefaultChar = NULL;
7723
7724 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7725 PyErr_NoMemory();
7726 goto error;
7727 }
7728 outsize = insize * Py_ARRAY_LENGTH(buffer);
7729
7730 if (*outbytes == NULL) {
7731 /* Create string object */
7732 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7733 if (*outbytes == NULL)
7734 goto error;
7735 out = PyBytes_AS_STRING(*outbytes);
7736 }
7737 else {
7738 /* Extend string object */
7739 Py_ssize_t n = PyBytes_Size(*outbytes);
7740 if (n > PY_SSIZE_T_MAX - outsize) {
7741 PyErr_NoMemory();
7742 goto error;
7743 }
7744 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7745 goto error;
7746 out = PyBytes_AS_STRING(*outbytes) + n;
7747 }
7748
7749 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007750 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007751 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007752 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7753 wchar_t chars[2];
7754 int charsize;
7755 if (ch < 0x10000) {
7756 chars[0] = (wchar_t)ch;
7757 charsize = 1;
7758 }
7759 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007760 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7761 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007762 charsize = 2;
7763 }
7764
Victor Stinner3a50e702011-10-18 21:21:00 +02007765 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007766 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007767 buffer, Py_ARRAY_LENGTH(buffer),
7768 NULL, pusedDefaultChar);
7769 if (outsize > 0) {
7770 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7771 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007772 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007773 memcpy(out, buffer, outsize);
7774 out += outsize;
7775 continue;
7776 }
7777 }
7778 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7779 PyErr_SetFromWindowsErr(0);
7780 goto error;
7781 }
7782
Victor Stinner3a50e702011-10-18 21:21:00 +02007783 rep = unicode_encode_call_errorhandler(
7784 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007785 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007786 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007787 if (rep == NULL)
7788 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007789 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007790
7791 if (PyBytes_Check(rep)) {
7792 outsize = PyBytes_GET_SIZE(rep);
7793 if (outsize != 1) {
7794 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7795 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7796 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7797 Py_DECREF(rep);
7798 goto error;
7799 }
7800 out = PyBytes_AS_STRING(*outbytes) + offset;
7801 }
7802 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7803 out += outsize;
7804 }
7805 else {
7806 Py_ssize_t i;
7807 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007808 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007809
Benjamin Petersonbac79492012-01-14 13:34:47 -05007810 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007811 Py_DECREF(rep);
7812 goto error;
7813 }
7814
7815 outsize = PyUnicode_GET_LENGTH(rep);
7816 if (outsize != 1) {
7817 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7818 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7819 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7820 Py_DECREF(rep);
7821 goto error;
7822 }
7823 out = PyBytes_AS_STRING(*outbytes) + offset;
7824 }
7825 kind = PyUnicode_KIND(rep);
7826 data = PyUnicode_DATA(rep);
7827 for (i=0; i < outsize; i++) {
7828 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7829 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007830 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007831 encoding, unicode,
7832 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007833 "unable to encode error handler result to ASCII");
7834 Py_DECREF(rep);
7835 goto error;
7836 }
7837 *out = (unsigned char)ch;
7838 out++;
7839 }
7840 }
7841 Py_DECREF(rep);
7842 }
7843 /* write a NUL byte */
7844 *out = 0;
7845 outsize = out - PyBytes_AS_STRING(*outbytes);
7846 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7847 if (_PyBytes_Resize(outbytes, outsize) < 0)
7848 goto error;
7849 ret = 0;
7850
7851error:
7852 Py_XDECREF(encoding_obj);
7853 Py_XDECREF(errorHandler);
7854 Py_XDECREF(exc);
7855 return ret;
7856}
7857
Victor Stinner3a50e702011-10-18 21:21:00 +02007858static PyObject *
7859encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007860 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007861 const char *errors)
7862{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007863 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007864 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007865 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007866 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007867
Victor Stinner29dacf22015-01-26 16:41:32 +01007868 if (!PyUnicode_Check(unicode)) {
7869 PyErr_BadArgument();
7870 return NULL;
7871 }
7872
Benjamin Petersonbac79492012-01-14 13:34:47 -05007873 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007874 return NULL;
7875 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007876
Victor Stinner3a50e702011-10-18 21:21:00 +02007877 if (code_page < 0) {
7878 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7879 return NULL;
7880 }
7881
Martin v. Löwis3d325192011-11-04 18:23:06 +01007882 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007883 return PyBytes_FromStringAndSize(NULL, 0);
7884
Victor Stinner7581cef2011-11-03 22:32:33 +01007885 offset = 0;
7886 do
7887 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007888#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007889 if (len > DECODING_CHUNK_SIZE) {
7890 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007891 done = 0;
7892 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007893 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007894#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007895 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007896 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007897 done = 1;
7898 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007899
Victor Stinner76a31a62011-11-04 00:05:13 +01007900 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007901 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007902 errors);
7903 if (ret == -2)
7904 ret = encode_code_page_errors(code_page, &outbytes,
7905 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007906 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007907 if (ret < 0) {
7908 Py_XDECREF(outbytes);
7909 return NULL;
7910 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007911
Victor Stinner7581cef2011-11-03 22:32:33 +01007912 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007913 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007914 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007915
Victor Stinner3a50e702011-10-18 21:21:00 +02007916 return outbytes;
7917}
7918
7919PyObject *
7920PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7921 Py_ssize_t size,
7922 const char *errors)
7923{
Victor Stinner7581cef2011-11-03 22:32:33 +01007924 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007925 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007926 if (unicode == NULL)
7927 return NULL;
7928 res = encode_code_page(CP_ACP, unicode, errors);
7929 Py_DECREF(unicode);
7930 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007931}
7932
7933PyObject *
7934PyUnicode_EncodeCodePage(int code_page,
7935 PyObject *unicode,
7936 const char *errors)
7937{
Victor Stinner7581cef2011-11-03 22:32:33 +01007938 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007939}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007940
Alexander Belopolsky40018472011-02-26 01:02:56 +00007941PyObject *
7942PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007943{
Victor Stinner7581cef2011-11-03 22:32:33 +01007944 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007945}
7946
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007947#undef NEED_RETRY
7948
Steve Dowercc16be82016-09-08 10:35:16 -07007949#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007950
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951/* --- Character Mapping Codec -------------------------------------------- */
7952
Victor Stinnerfb161b12013-04-18 01:44:27 +02007953static int
7954charmap_decode_string(const char *s,
7955 Py_ssize_t size,
7956 PyObject *mapping,
7957 const char *errors,
7958 _PyUnicodeWriter *writer)
7959{
7960 const char *starts = s;
7961 const char *e;
7962 Py_ssize_t startinpos, endinpos;
7963 PyObject *errorHandler = NULL, *exc = NULL;
7964 Py_ssize_t maplen;
7965 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007966 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007967 Py_UCS4 x;
7968 unsigned char ch;
7969
7970 if (PyUnicode_READY(mapping) == -1)
7971 return -1;
7972
7973 maplen = PyUnicode_GET_LENGTH(mapping);
7974 mapdata = PyUnicode_DATA(mapping);
7975 mapkind = PyUnicode_KIND(mapping);
7976
7977 e = s + size;
7978
7979 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7980 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7981 * is disabled in encoding aliases, latin1 is preferred because
7982 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007983 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007984 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7985 Py_UCS4 maxchar = writer->maxchar;
7986
7987 assert (writer->kind == PyUnicode_1BYTE_KIND);
7988 while (s < e) {
7989 ch = *s;
7990 x = mapdata_ucs1[ch];
7991 if (x > maxchar) {
7992 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7993 goto onError;
7994 maxchar = writer->maxchar;
7995 outdata = (Py_UCS1 *)writer->data;
7996 }
7997 outdata[writer->pos] = x;
7998 writer->pos++;
7999 ++s;
8000 }
8001 return 0;
8002 }
8003
8004 while (s < e) {
8005 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8006 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008007 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008008 if (outkind == PyUnicode_1BYTE_KIND) {
8009 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8010 Py_UCS4 maxchar = writer->maxchar;
8011 while (s < e) {
8012 ch = *s;
8013 x = mapdata_ucs2[ch];
8014 if (x > maxchar)
8015 goto Error;
8016 outdata[writer->pos] = x;
8017 writer->pos++;
8018 ++s;
8019 }
8020 break;
8021 }
8022 else if (outkind == PyUnicode_2BYTE_KIND) {
8023 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8024 while (s < e) {
8025 ch = *s;
8026 x = mapdata_ucs2[ch];
8027 if (x == 0xFFFE)
8028 goto Error;
8029 outdata[writer->pos] = x;
8030 writer->pos++;
8031 ++s;
8032 }
8033 break;
8034 }
8035 }
8036 ch = *s;
8037
8038 if (ch < maplen)
8039 x = PyUnicode_READ(mapkind, mapdata, ch);
8040 else
8041 x = 0xfffe; /* invalid value */
8042Error:
8043 if (x == 0xfffe)
8044 {
8045 /* undefined mapping */
8046 startinpos = s-starts;
8047 endinpos = startinpos+1;
8048 if (unicode_decode_call_errorhandler_writer(
8049 errors, &errorHandler,
8050 "charmap", "character maps to <undefined>",
8051 &starts, &e, &startinpos, &endinpos, &exc, &s,
8052 writer)) {
8053 goto onError;
8054 }
8055 continue;
8056 }
8057
8058 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8059 goto onError;
8060 ++s;
8061 }
8062 Py_XDECREF(errorHandler);
8063 Py_XDECREF(exc);
8064 return 0;
8065
8066onError:
8067 Py_XDECREF(errorHandler);
8068 Py_XDECREF(exc);
8069 return -1;
8070}
8071
8072static int
8073charmap_decode_mapping(const char *s,
8074 Py_ssize_t size,
8075 PyObject *mapping,
8076 const char *errors,
8077 _PyUnicodeWriter *writer)
8078{
8079 const char *starts = s;
8080 const char *e;
8081 Py_ssize_t startinpos, endinpos;
8082 PyObject *errorHandler = NULL, *exc = NULL;
8083 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008084 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008085
8086 e = s + size;
8087
8088 while (s < e) {
8089 ch = *s;
8090
8091 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8092 key = PyLong_FromLong((long)ch);
8093 if (key == NULL)
8094 goto onError;
8095
8096 item = PyObject_GetItem(mapping, key);
8097 Py_DECREF(key);
8098 if (item == NULL) {
8099 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8100 /* No mapping found means: mapping is undefined. */
8101 PyErr_Clear();
8102 goto Undefined;
8103 } else
8104 goto onError;
8105 }
8106
8107 /* Apply mapping */
8108 if (item == Py_None)
8109 goto Undefined;
8110 if (PyLong_Check(item)) {
8111 long value = PyLong_AS_LONG(item);
8112 if (value == 0xFFFE)
8113 goto Undefined;
8114 if (value < 0 || value > MAX_UNICODE) {
8115 PyErr_Format(PyExc_TypeError,
8116 "character mapping must be in range(0x%lx)",
8117 (unsigned long)MAX_UNICODE + 1);
8118 goto onError;
8119 }
8120
8121 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8122 goto onError;
8123 }
8124 else if (PyUnicode_Check(item)) {
8125 if (PyUnicode_READY(item) == -1)
8126 goto onError;
8127 if (PyUnicode_GET_LENGTH(item) == 1) {
8128 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8129 if (value == 0xFFFE)
8130 goto Undefined;
8131 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8132 goto onError;
8133 }
8134 else {
8135 writer->overallocate = 1;
8136 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8137 goto onError;
8138 }
8139 }
8140 else {
8141 /* wrong return value */
8142 PyErr_SetString(PyExc_TypeError,
8143 "character mapping must return integer, None or str");
8144 goto onError;
8145 }
8146 Py_CLEAR(item);
8147 ++s;
8148 continue;
8149
8150Undefined:
8151 /* undefined mapping */
8152 Py_CLEAR(item);
8153 startinpos = s-starts;
8154 endinpos = startinpos+1;
8155 if (unicode_decode_call_errorhandler_writer(
8156 errors, &errorHandler,
8157 "charmap", "character maps to <undefined>",
8158 &starts, &e, &startinpos, &endinpos, &exc, &s,
8159 writer)) {
8160 goto onError;
8161 }
8162 }
8163 Py_XDECREF(errorHandler);
8164 Py_XDECREF(exc);
8165 return 0;
8166
8167onError:
8168 Py_XDECREF(item);
8169 Py_XDECREF(errorHandler);
8170 Py_XDECREF(exc);
8171 return -1;
8172}
8173
Alexander Belopolsky40018472011-02-26 01:02:56 +00008174PyObject *
8175PyUnicode_DecodeCharmap(const char *s,
8176 Py_ssize_t size,
8177 PyObject *mapping,
8178 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008180 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008181
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182 /* Default to Latin-1 */
8183 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008187 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008188 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008189 writer.min_length = size;
8190 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008192
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008193 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008194 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8195 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008196 }
8197 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008198 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8199 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008201 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008202
Benjamin Peterson29060642009-01-31 22:14:21 +00008203 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008204 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205 return NULL;
8206}
8207
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008208/* Charmap encoding: the lookup table */
8209
Alexander Belopolsky40018472011-02-26 01:02:56 +00008210struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 PyObject_HEAD
8212 unsigned char level1[32];
8213 int count2, count3;
8214 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008215};
8216
8217static PyObject*
8218encoding_map_size(PyObject *obj, PyObject* args)
8219{
8220 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008221 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008223}
8224
8225static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008226 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 PyDoc_STR("Return the size (in bytes) of this object") },
8228 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008229};
8230
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008231static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008232 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 "EncodingMap", /*tp_name*/
8234 sizeof(struct encoding_map), /*tp_basicsize*/
8235 0, /*tp_itemsize*/
8236 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008237 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008238 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 0, /*tp_getattr*/
8240 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008241 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 0, /*tp_repr*/
8243 0, /*tp_as_number*/
8244 0, /*tp_as_sequence*/
8245 0, /*tp_as_mapping*/
8246 0, /*tp_hash*/
8247 0, /*tp_call*/
8248 0, /*tp_str*/
8249 0, /*tp_getattro*/
8250 0, /*tp_setattro*/
8251 0, /*tp_as_buffer*/
8252 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8253 0, /*tp_doc*/
8254 0, /*tp_traverse*/
8255 0, /*tp_clear*/
8256 0, /*tp_richcompare*/
8257 0, /*tp_weaklistoffset*/
8258 0, /*tp_iter*/
8259 0, /*tp_iternext*/
8260 encoding_map_methods, /*tp_methods*/
8261 0, /*tp_members*/
8262 0, /*tp_getset*/
8263 0, /*tp_base*/
8264 0, /*tp_dict*/
8265 0, /*tp_descr_get*/
8266 0, /*tp_descr_set*/
8267 0, /*tp_dictoffset*/
8268 0, /*tp_init*/
8269 0, /*tp_alloc*/
8270 0, /*tp_new*/
8271 0, /*tp_free*/
8272 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008273};
8274
8275PyObject*
8276PyUnicode_BuildEncodingMap(PyObject* string)
8277{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008278 PyObject *result;
8279 struct encoding_map *mresult;
8280 int i;
8281 int need_dict = 0;
8282 unsigned char level1[32];
8283 unsigned char level2[512];
8284 unsigned char *mlevel1, *mlevel2, *mlevel3;
8285 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008286 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008287 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008288 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008289 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008290
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008291 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008292 PyErr_BadArgument();
8293 return NULL;
8294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 kind = PyUnicode_KIND(string);
8296 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008297 length = PyUnicode_GET_LENGTH(string);
8298 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008299 memset(level1, 0xFF, sizeof level1);
8300 memset(level2, 0xFF, sizeof level2);
8301
8302 /* If there isn't a one-to-one mapping of NULL to \0,
8303 or if there are non-BMP characters, we need to use
8304 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008305 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008306 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008307 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008308 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008309 ch = PyUnicode_READ(kind, data, i);
8310 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008311 need_dict = 1;
8312 break;
8313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008314 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008315 /* unmapped character */
8316 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008317 l1 = ch >> 11;
8318 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008319 if (level1[l1] == 0xFF)
8320 level1[l1] = count2++;
8321 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008322 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008323 }
8324
8325 if (count2 >= 0xFF || count3 >= 0xFF)
8326 need_dict = 1;
8327
8328 if (need_dict) {
8329 PyObject *result = PyDict_New();
8330 PyObject *key, *value;
8331 if (!result)
8332 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008333 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008335 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008336 if (!key || !value)
8337 goto failed1;
8338 if (PyDict_SetItem(result, key, value) == -1)
8339 goto failed1;
8340 Py_DECREF(key);
8341 Py_DECREF(value);
8342 }
8343 return result;
8344 failed1:
8345 Py_XDECREF(key);
8346 Py_XDECREF(value);
8347 Py_DECREF(result);
8348 return NULL;
8349 }
8350
8351 /* Create a three-level trie */
8352 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8353 16*count2 + 128*count3 - 1);
8354 if (!result)
8355 return PyErr_NoMemory();
8356 PyObject_Init(result, &EncodingMapType);
8357 mresult = (struct encoding_map*)result;
8358 mresult->count2 = count2;
8359 mresult->count3 = count3;
8360 mlevel1 = mresult->level1;
8361 mlevel2 = mresult->level23;
8362 mlevel3 = mresult->level23 + 16*count2;
8363 memcpy(mlevel1, level1, 32);
8364 memset(mlevel2, 0xFF, 16*count2);
8365 memset(mlevel3, 0, 128*count3);
8366 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008367 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008368 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008369 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8370 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008371 /* unmapped character */
8372 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008373 o1 = ch>>11;
8374 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008375 i2 = 16*mlevel1[o1] + o2;
8376 if (mlevel2[i2] == 0xFF)
8377 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008378 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008379 i3 = 128*mlevel2[i2] + o3;
8380 mlevel3[i3] = i;
8381 }
8382 return result;
8383}
8384
8385static int
Victor Stinner22168992011-11-20 17:09:18 +01008386encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008387{
8388 struct encoding_map *map = (struct encoding_map*)mapping;
8389 int l1 = c>>11;
8390 int l2 = (c>>7) & 0xF;
8391 int l3 = c & 0x7F;
8392 int i;
8393
Victor Stinner22168992011-11-20 17:09:18 +01008394 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008396 if (c == 0)
8397 return 0;
8398 /* level 1*/
8399 i = map->level1[l1];
8400 if (i == 0xFF) {
8401 return -1;
8402 }
8403 /* level 2*/
8404 i = map->level23[16*i+l2];
8405 if (i == 0xFF) {
8406 return -1;
8407 }
8408 /* level 3 */
8409 i = map->level23[16*map->count2 + 128*i + l3];
8410 if (i == 0) {
8411 return -1;
8412 }
8413 return i;
8414}
8415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416/* Lookup the character ch in the mapping. If the character
8417 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008418 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008419static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008420charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421{
Christian Heimes217cfd12007-12-02 14:31:20 +00008422 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423 PyObject *x;
8424
8425 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427 x = PyObject_GetItem(mapping, w);
8428 Py_DECREF(w);
8429 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8431 /* No mapping found means: mapping is undefined. */
8432 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008433 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 } else
8435 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008437 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008439 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 long value = PyLong_AS_LONG(x);
8441 if (value < 0 || value > 255) {
8442 PyErr_SetString(PyExc_TypeError,
8443 "character mapping must be in range(256)");
8444 Py_DECREF(x);
8445 return NULL;
8446 }
8447 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008449 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 /* wrong return value */
8453 PyErr_Format(PyExc_TypeError,
8454 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008455 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 Py_DECREF(x);
8457 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458 }
8459}
8460
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008461static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008462charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008463{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008464 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8465 /* exponentially overallocate to minimize reallocations */
8466 if (requiredsize < 2*outsize)
8467 requiredsize = 2*outsize;
8468 if (_PyBytes_Resize(outobj, requiredsize))
8469 return -1;
8470 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008471}
8472
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008477 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008478 space is available. Return a new reference to the object that
8479 was put in the output buffer, or Py_None, if the mapping was undefined
8480 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008481 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008482static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008483charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008484 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008485{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008486 PyObject *rep;
8487 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008488 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489
Andy Lesterdffe4c02020-03-04 07:15:20 -06008490 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008491 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008493 if (res == -1)
8494 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 if (outsize<requiredsize)
8496 if (charmapencode_resize(outobj, outpos, requiredsize))
8497 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008498 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 outstart[(*outpos)++] = (char)res;
8500 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008501 }
8502
8503 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008504 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008506 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 Py_DECREF(rep);
8508 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008509 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 if (PyLong_Check(rep)) {
8511 Py_ssize_t requiredsize = *outpos+1;
8512 if (outsize<requiredsize)
8513 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8514 Py_DECREF(rep);
8515 return enc_EXCEPTION;
8516 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008517 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008519 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 else {
8521 const char *repchars = PyBytes_AS_STRING(rep);
8522 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8523 Py_ssize_t requiredsize = *outpos+repsize;
8524 if (outsize<requiredsize)
8525 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8526 Py_DECREF(rep);
8527 return enc_EXCEPTION;
8528 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008529 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 memcpy(outstart + *outpos, repchars, repsize);
8531 *outpos += repsize;
8532 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008534 Py_DECREF(rep);
8535 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008536}
8537
8538/* handle an error in PyUnicode_EncodeCharmap
8539 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008540static int
8541charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008542 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008544 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008545 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546{
8547 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008548 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008549 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008550 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008551 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008552 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008554 Py_ssize_t collstartpos = *inpos;
8555 Py_ssize_t collendpos = *inpos+1;
8556 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008557 const char *encoding = "charmap";
8558 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008559 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008560 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008561 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562
Benjamin Petersonbac79492012-01-14 13:34:47 -05008563 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008564 return -1;
8565 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008566 /* find all unencodable characters */
8567 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008568 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008569 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008570 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008571 val = encoding_map_lookup(ch, mapping);
8572 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 break;
8574 ++collendpos;
8575 continue;
8576 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008577
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008578 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8579 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 if (rep==NULL)
8581 return -1;
8582 else if (rep!=Py_None) {
8583 Py_DECREF(rep);
8584 break;
8585 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008586 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588 }
8589 /* cache callback name lookup
8590 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008591 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008592 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008593
8594 switch (*error_handler) {
8595 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008596 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008597 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008598
8599 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008600 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 x = charmapencode_output('?', mapping, res, respos);
8602 if (x==enc_EXCEPTION) {
8603 return -1;
8604 }
8605 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008606 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 return -1;
8608 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008609 }
8610 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008611 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008612 *inpos = collendpos;
8613 break;
Victor Stinner50149202015-09-22 00:26:54 +02008614
8615 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008616 /* generate replacement (temporarily (mis)uses p) */
8617 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 char buffer[2+29+1+1];
8619 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008620 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 for (cp = buffer; *cp; ++cp) {
8622 x = charmapencode_output(*cp, mapping, res, respos);
8623 if (x==enc_EXCEPTION)
8624 return -1;
8625 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008626 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 return -1;
8628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008629 }
8630 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008631 *inpos = collendpos;
8632 break;
Victor Stinner50149202015-09-22 00:26:54 +02008633
Benjamin Peterson14339b62009-01-31 16:36:08 +00008634 default:
Victor Stinner50149202015-09-22 00:26:54 +02008635 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008636 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008638 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008640 if (PyBytes_Check(repunicode)) {
8641 /* Directly copy bytes result to output. */
8642 Py_ssize_t outsize = PyBytes_Size(*res);
8643 Py_ssize_t requiredsize;
8644 repsize = PyBytes_Size(repunicode);
8645 requiredsize = *respos + repsize;
8646 if (requiredsize > outsize)
8647 /* Make room for all additional bytes. */
8648 if (charmapencode_resize(res, respos, requiredsize)) {
8649 Py_DECREF(repunicode);
8650 return -1;
8651 }
8652 memcpy(PyBytes_AsString(*res) + *respos,
8653 PyBytes_AsString(repunicode), repsize);
8654 *respos += repsize;
8655 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008656 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008657 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008658 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008659 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008660 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008661 Py_DECREF(repunicode);
8662 return -1;
8663 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008664 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008665 data = PyUnicode_DATA(repunicode);
8666 kind = PyUnicode_KIND(repunicode);
8667 for (index = 0; index < repsize; index++) {
8668 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8669 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008671 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 return -1;
8673 }
8674 else if (x==enc_FAILED) {
8675 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008676 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 return -1;
8678 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008679 }
8680 *inpos = newpos;
8681 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682 }
8683 return 0;
8684}
8685
Alexander Belopolsky40018472011-02-26 01:02:56 +00008686PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008687_PyUnicode_EncodeCharmap(PyObject *unicode,
8688 PyObject *mapping,
8689 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 /* output object */
8692 PyObject *res = NULL;
8693 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008694 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008695 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008697 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008698 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008700 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008701 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008702 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703
Benjamin Petersonbac79492012-01-14 13:34:47 -05008704 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008705 return NULL;
8706 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008707 data = PyUnicode_DATA(unicode);
8708 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008709
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710 /* Default to Latin-1 */
8711 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008712 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 /* allocate enough for a simple encoding without
8715 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008716 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717 if (res == NULL)
8718 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008719 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008722 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008723 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008725 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 if (x==enc_EXCEPTION) /* error */
8727 goto onError;
8728 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008729 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008731 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 &res, &respos)) {
8733 goto onError;
8734 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008735 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 else
8737 /* done with this character => adjust input position */
8738 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008742 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008743 if (_PyBytes_Resize(&res, respos) < 0)
8744 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008747 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008748 return res;
8749
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008751 Py_XDECREF(res);
8752 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008753 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 return NULL;
8755}
8756
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008757/* Deprecated */
8758PyObject *
8759PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8760 Py_ssize_t size,
8761 PyObject *mapping,
8762 const char *errors)
8763{
8764 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008765 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008766 if (unicode == NULL)
8767 return NULL;
8768 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8769 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008770 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008771}
8772
Alexander Belopolsky40018472011-02-26 01:02:56 +00008773PyObject *
8774PyUnicode_AsCharmapString(PyObject *unicode,
8775 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776{
8777 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 PyErr_BadArgument();
8779 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008781 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782}
8783
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008784/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008785static void
8786make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008788 Py_ssize_t startpos, Py_ssize_t endpos,
8789 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008791 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 *exceptionObject = _PyUnicodeTranslateError_Create(
8793 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794 }
8795 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8797 goto onError;
8798 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8799 goto onError;
8800 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8801 goto onError;
8802 return;
8803 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008804 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805 }
8806}
8807
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008808/* error handling callback helper:
8809 build arguments, call the callback and check the arguments,
8810 put the result into newpos and return the replacement string, which
8811 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008812static PyObject *
8813unicode_translate_call_errorhandler(const char *errors,
8814 PyObject **errorHandler,
8815 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008817 Py_ssize_t startpos, Py_ssize_t endpos,
8818 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008819{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008820 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008821
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008822 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008823 PyObject *restuple;
8824 PyObject *resunicode;
8825
8826 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008828 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008830 }
8831
8832 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008834 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008835 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008836
Petr Viktorinffd97532020-02-11 17:46:57 +01008837 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008838 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008840 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008841 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 Py_DECREF(restuple);
8843 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008844 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008845 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008846 &resunicode, &i_newpos)) {
8847 Py_DECREF(restuple);
8848 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008849 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008850 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008852 else
8853 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008855 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008856 Py_DECREF(restuple);
8857 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008858 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008859 Py_INCREF(resunicode);
8860 Py_DECREF(restuple);
8861 return resunicode;
8862}
8863
8864/* Lookup the character ch in the mapping and put the result in result,
8865 which must be decrefed by the caller.
8866 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008867static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008869{
Christian Heimes217cfd12007-12-02 14:31:20 +00008870 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008871 PyObject *x;
8872
8873 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008875 x = PyObject_GetItem(mapping, w);
8876 Py_DECREF(w);
8877 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8879 /* No mapping found means: use 1:1 mapping. */
8880 PyErr_Clear();
8881 *result = NULL;
8882 return 0;
8883 } else
8884 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008885 }
8886 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 *result = x;
8888 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008889 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008890 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008891 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008892 if (value < 0 || value > MAX_UNICODE) {
8893 PyErr_Format(PyExc_ValueError,
8894 "character mapping must be in range(0x%x)",
8895 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008896 Py_DECREF(x);
8897 return -1;
8898 }
8899 *result = x;
8900 return 0;
8901 }
8902 else if (PyUnicode_Check(x)) {
8903 *result = x;
8904 return 0;
8905 }
8906 else {
8907 /* wrong return value */
8908 PyErr_SetString(PyExc_TypeError,
8909 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008910 Py_DECREF(x);
8911 return -1;
8912 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008913}
Victor Stinner1194ea02014-04-04 19:37:40 +02008914
8915/* lookup the character, write the result into the writer.
8916 Return 1 if the result was written into the writer, return 0 if the mapping
8917 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008918static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008919charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8920 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008921{
Victor Stinner1194ea02014-04-04 19:37:40 +02008922 PyObject *item;
8923
8924 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008925 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008926
8927 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008928 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008929 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008932 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008933 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008934
8935 if (item == Py_None) {
8936 Py_DECREF(item);
8937 return 0;
8938 }
8939
8940 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008941 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8942 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8943 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008944 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8945 Py_DECREF(item);
8946 return -1;
8947 }
8948 Py_DECREF(item);
8949 return 1;
8950 }
8951
8952 if (!PyUnicode_Check(item)) {
8953 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008955 }
8956
8957 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8958 Py_DECREF(item);
8959 return -1;
8960 }
8961
8962 Py_DECREF(item);
8963 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008964}
8965
Victor Stinner89a76ab2014-04-05 11:44:04 +02008966static int
8967unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8968 Py_UCS1 *translate)
8969{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008970 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008971 int ret = 0;
8972
Victor Stinner89a76ab2014-04-05 11:44:04 +02008973 if (charmaptranslate_lookup(ch, mapping, &item)) {
8974 return -1;
8975 }
8976
8977 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008978 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008979 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008980 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008981 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008982 /* not found => default to 1:1 mapping */
8983 translate[ch] = ch;
8984 return 1;
8985 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008986 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008987 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008988 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8989 used it */
8990 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008991 /* invalid character or character outside ASCII:
8992 skip the fast translate */
8993 goto exit;
8994 }
8995 translate[ch] = (Py_UCS1)replace;
8996 }
8997 else if (PyUnicode_Check(item)) {
8998 Py_UCS4 replace;
8999
9000 if (PyUnicode_READY(item) == -1) {
9001 Py_DECREF(item);
9002 return -1;
9003 }
9004 if (PyUnicode_GET_LENGTH(item) != 1)
9005 goto exit;
9006
9007 replace = PyUnicode_READ_CHAR(item, 0);
9008 if (replace > 127)
9009 goto exit;
9010 translate[ch] = (Py_UCS1)replace;
9011 }
9012 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009013 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009014 goto exit;
9015 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009016 ret = 1;
9017
Benjamin Peterson1365de72014-04-07 20:15:41 -04009018 exit:
9019 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009020 return ret;
9021}
9022
9023/* Fast path for ascii => ascii translation. Return 1 if the whole string
9024 was translated into writer, return 0 if the input string was partially
9025 translated into writer, raise an exception and return -1 on error. */
9026static int
9027unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009028 _PyUnicodeWriter *writer, int ignore,
9029 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009030{
Victor Stinner872b2912014-04-05 14:27:07 +02009031 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009032 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009033 const Py_UCS1 *in, *end;
9034 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009035 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009036
Victor Stinner89a76ab2014-04-05 11:44:04 +02009037 len = PyUnicode_GET_LENGTH(input);
9038
Victor Stinner872b2912014-04-05 14:27:07 +02009039 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009040
9041 in = PyUnicode_1BYTE_DATA(input);
9042 end = in + len;
9043
9044 assert(PyUnicode_IS_ASCII(writer->buffer));
9045 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9046 out = PyUnicode_1BYTE_DATA(writer->buffer);
9047
Victor Stinner872b2912014-04-05 14:27:07 +02009048 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009049 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009050 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009051 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009052 int translate = unicode_fast_translate_lookup(mapping, ch,
9053 ascii_table);
9054 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009055 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009056 if (translate == 0)
9057 goto exit;
9058 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009059 }
Victor Stinner872b2912014-04-05 14:27:07 +02009060 if (ch2 == 0xfe) {
9061 if (ignore)
9062 continue;
9063 goto exit;
9064 }
9065 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009066 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009067 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009068 }
Victor Stinner872b2912014-04-05 14:27:07 +02009069 res = 1;
9070
9071exit:
9072 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009073 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009074 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009075}
9076
Victor Stinner3222da22015-10-01 22:07:32 +02009077static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078_PyUnicode_TranslateCharmap(PyObject *input,
9079 PyObject *mapping,
9080 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009083 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 Py_ssize_t size, i;
9085 int kind;
9086 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009087 _PyUnicodeWriter writer;
9088 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009089 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009090 PyObject *errorHandler = NULL;
9091 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009092 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009093 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009094
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009096 PyErr_BadArgument();
9097 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 if (PyUnicode_READY(input) == -1)
9101 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009102 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009103 kind = PyUnicode_KIND(input);
9104 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009106 if (size == 0)
9107 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009109 /* allocate enough for a simple 1:1 translation without
9110 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009111 _PyUnicodeWriter_Init(&writer);
9112 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114
Victor Stinner872b2912014-04-05 14:27:07 +02009115 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9116
Victor Stinner33798672016-03-01 21:59:58 +01009117 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009118 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009119 if (PyUnicode_IS_ASCII(input)) {
9120 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9121 if (res < 0) {
9122 _PyUnicodeWriter_Dealloc(&writer);
9123 return NULL;
9124 }
9125 if (res == 1)
9126 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009127 }
Victor Stinner33798672016-03-01 21:59:58 +01009128 else {
9129 i = 0;
9130 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009132 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009133 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009134 int translate;
9135 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9136 Py_ssize_t newpos;
9137 /* startpos for collecting untranslatable chars */
9138 Py_ssize_t collstart;
9139 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009140 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141
Victor Stinner1194ea02014-04-04 19:37:40 +02009142 ch = PyUnicode_READ(kind, data, i);
9143 translate = charmaptranslate_output(ch, mapping, &writer);
9144 if (translate < 0)
9145 goto onError;
9146
9147 if (translate != 0) {
9148 /* it worked => adjust input pointer */
9149 ++i;
9150 continue;
9151 }
9152
9153 /* untranslatable character */
9154 collstart = i;
9155 collend = i+1;
9156
9157 /* find all untranslatable characters */
9158 while (collend < size) {
9159 PyObject *x;
9160 ch = PyUnicode_READ(kind, data, collend);
9161 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009162 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009163 Py_XDECREF(x);
9164 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009165 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009166 ++collend;
9167 }
9168
9169 if (ignore) {
9170 i = collend;
9171 }
9172 else {
9173 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9174 reason, input, &exc,
9175 collstart, collend, &newpos);
9176 if (repunicode == NULL)
9177 goto onError;
9178 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009179 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009180 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009181 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009182 Py_DECREF(repunicode);
9183 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009184 }
9185 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009186 Py_XDECREF(exc);
9187 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009188 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009189
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009191 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009192 Py_XDECREF(exc);
9193 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009194 return NULL;
9195}
9196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197/* Deprecated. Use PyUnicode_Translate instead. */
9198PyObject *
9199PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9200 Py_ssize_t size,
9201 PyObject *mapping,
9202 const char *errors)
9203{
Christian Heimes5f520f42012-09-11 14:03:25 +02009204 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009205 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 if (!unicode)
9207 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009208 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9209 Py_DECREF(unicode);
9210 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009211}
9212
Alexander Belopolsky40018472011-02-26 01:02:56 +00009213PyObject *
9214PyUnicode_Translate(PyObject *str,
9215 PyObject *mapping,
9216 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009218 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009219 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009220 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221}
Tim Petersced69f82003-09-16 20:30:58 +00009222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223PyObject *
9224_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9225{
9226 if (!PyUnicode_Check(unicode)) {
9227 PyErr_BadInternalCall();
9228 return NULL;
9229 }
9230 if (PyUnicode_READY(unicode) == -1)
9231 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009232 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233 /* If the string is already ASCII, just return the same string */
9234 Py_INCREF(unicode);
9235 return unicode;
9236 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009237
9238 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9239 PyObject *result = PyUnicode_New(len, 127);
9240 if (result == NULL) {
9241 return NULL;
9242 }
9243
9244 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9245 int kind = PyUnicode_KIND(unicode);
9246 const void *data = PyUnicode_DATA(unicode);
9247 Py_ssize_t i;
9248 for (i = 0; i < len; ++i) {
9249 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9250 if (ch < 127) {
9251 out[i] = ch;
9252 }
9253 else if (Py_UNICODE_ISSPACE(ch)) {
9254 out[i] = ' ';
9255 }
9256 else {
9257 int decimal = Py_UNICODE_TODECIMAL(ch);
9258 if (decimal < 0) {
9259 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009260 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009261 _PyUnicode_LENGTH(result) = i + 1;
9262 break;
9263 }
9264 out[i] = '0' + decimal;
9265 }
9266 }
9267
INADA Naoki16dfca42018-07-14 12:06:43 +09009268 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009269 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270}
9271
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009272PyObject *
9273PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9274 Py_ssize_t length)
9275{
Victor Stinnerf0124502011-11-21 23:12:56 +01009276 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009277 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009278 Py_UCS4 maxchar;
9279 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009280 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009281
Victor Stinner99d7ad02012-02-22 13:37:39 +01009282 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009283 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009284 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009285 if (ch > 127) {
9286 int decimal = Py_UNICODE_TODECIMAL(ch);
9287 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009288 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009289 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009290 }
9291 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009292
9293 /* Copy to a new string */
9294 decimal = PyUnicode_New(length, maxchar);
9295 if (decimal == NULL)
9296 return decimal;
9297 kind = PyUnicode_KIND(decimal);
9298 data = PyUnicode_DATA(decimal);
9299 /* Iterate over code points */
9300 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009301 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009302 if (ch > 127) {
9303 int decimal = Py_UNICODE_TODECIMAL(ch);
9304 if (decimal >= 0)
9305 ch = '0' + decimal;
9306 }
9307 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009309 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009310}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009311/* --- Decimal Encoder ---------------------------------------------------- */
9312
Alexander Belopolsky40018472011-02-26 01:02:56 +00009313int
9314PyUnicode_EncodeDecimal(Py_UNICODE *s,
9315 Py_ssize_t length,
9316 char *output,
9317 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009318{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009319 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009320 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009321 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009322 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009323
9324 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009325 PyErr_BadArgument();
9326 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009327 }
9328
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009329 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009330 if (unicode == NULL)
9331 return -1;
9332
Victor Stinner42bf7752011-11-21 22:52:58 +01009333 kind = PyUnicode_KIND(unicode);
9334 data = PyUnicode_DATA(unicode);
9335
Victor Stinnerb84d7232011-11-22 01:50:07 +01009336 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009337 PyObject *exc;
9338 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009339 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009340 Py_ssize_t startpos;
9341
9342 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009343
Benjamin Peterson29060642009-01-31 22:14:21 +00009344 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009345 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009346 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009347 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009348 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009349 decimal = Py_UNICODE_TODECIMAL(ch);
9350 if (decimal >= 0) {
9351 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009352 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009353 continue;
9354 }
9355 if (0 < ch && ch < 256) {
9356 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009357 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009358 continue;
9359 }
Victor Stinner6345be92011-11-25 20:09:01 +01009360
Victor Stinner42bf7752011-11-21 22:52:58 +01009361 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009362 exc = NULL;
9363 raise_encode_exception(&exc, "decimal", unicode,
9364 startpos, startpos+1,
9365 "invalid decimal Unicode string");
9366 Py_XDECREF(exc);
9367 Py_DECREF(unicode);
9368 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009369 }
9370 /* 0-terminate the output string */
9371 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009372 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009373 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009374}
9375
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376/* --- Helpers ------------------------------------------------------------ */
9377
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; `len` is the length of the
   sequence being sliced.
     - end > len is clamped to len;
     - a negative end or start has len added to it and, if still negative,
       becomes 0.
   A start greater than len is left unchanged.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (safe inside an unbraced if/else); it must be followed
   by a semicolon. Arguments are evaluated more than once. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009394any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009396 Py_ssize_t end,
9397 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009399 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009400 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 Py_ssize_t len1, len2, result;
9402
9403 kind1 = PyUnicode_KIND(s1);
9404 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009405 if (kind1 < kind2)
9406 return -1;
9407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 len1 = PyUnicode_GET_LENGTH(s1);
9409 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009410 ADJUST_INDICES(start, end, len1);
9411 if (end - start < len2)
9412 return -1;
9413
9414 buf1 = PyUnicode_DATA(s1);
9415 buf2 = PyUnicode_DATA(s2);
9416 if (len2 == 1) {
9417 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9418 result = findchar((const char *)buf1 + kind1*start,
9419 kind1, end - start, ch, direction);
9420 if (result == -1)
9421 return -1;
9422 else
9423 return start + result;
9424 }
9425
9426 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009427 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009428 if (!buf2)
9429 return -2;
9430 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431
Victor Stinner794d5672011-10-10 03:21:36 +02009432 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009433 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009434 case PyUnicode_1BYTE_KIND:
9435 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9436 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9437 else
9438 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9439 break;
9440 case PyUnicode_2BYTE_KIND:
9441 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9442 break;
9443 case PyUnicode_4BYTE_KIND:
9444 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9445 break;
9446 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009447 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009448 }
9449 }
9450 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009451 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009452 case PyUnicode_1BYTE_KIND:
9453 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9454 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9455 else
9456 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9457 break;
9458 case PyUnicode_2BYTE_KIND:
9459 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9460 break;
9461 case PyUnicode_4BYTE_KIND:
9462 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9463 break;
9464 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009465 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 }
9468
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009469 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009470 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009471 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472
9473 return result;
9474}
9475
Victor Stinner59423e32018-11-26 13:40:01 +01009476/* _PyUnicode_InsertThousandsGrouping() helper functions */
9477#include "stringlib/localeutil.h"
9478
9479/**
9480 * InsertThousandsGrouping:
9481 * @writer: Unicode writer.
9482 * @n_buffer: Number of characters in @buffer.
9483 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9484 * @d_pos: Start of digits string.
9485 * @n_digits: The number of digits in the string, in which we want
9486 * to put the grouping chars.
9487 * @min_width: The minimum width of the digits in the output string.
9488 * Output will be zero-padded on the left to fill.
9489 * @grouping: see definition in localeconv().
9490 * @thousands_sep: see definition in localeconv().
9491 *
9492 * There are 2 modes: counting and filling. If @writer is NULL,
9493 * we are in counting mode, else filling mode.
9494 * If counting, the required buffer size is returned.
9495 * If filling, we know the buffer will be large enough, so we don't
9496 * need to pass in the buffer size.
9497 * Inserts thousand grouping characters (as defined by grouping and
9498 * thousands_sep) into @writer.
9499 *
9500 * Return value: -1 on error, number of characters otherwise.
9501 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009503_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009504 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009505 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009506 PyObject *digits,
9507 Py_ssize_t d_pos,
9508 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009509 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009510 const char *grouping,
9511 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009512 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513{
Xtreak3f7983a2019-01-07 20:39:14 +05309514 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009515 if (writer) {
9516 assert(digits != NULL);
9517 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009518 }
9519 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009520 assert(digits == NULL);
9521 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009522 }
Victor Stinner59423e32018-11-26 13:40:01 +01009523 assert(0 <= d_pos);
9524 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009525 assert(grouping != NULL);
9526
9527 if (digits != NULL) {
9528 if (PyUnicode_READY(digits) == -1) {
9529 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009530 }
Victor Stinner59423e32018-11-26 13:40:01 +01009531 }
9532 if (PyUnicode_READY(thousands_sep) == -1) {
9533 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009534 }
9535
Victor Stinner59423e32018-11-26 13:40:01 +01009536 Py_ssize_t count = 0;
9537 Py_ssize_t n_zeros;
9538 int loop_broken = 0;
9539 int use_separator = 0; /* First time through, don't append the
9540 separator. They only go between
9541 groups. */
9542 Py_ssize_t buffer_pos;
9543 Py_ssize_t digits_pos;
9544 Py_ssize_t len;
9545 Py_ssize_t n_chars;
9546 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9547 be looked at */
9548 /* A generator that returns all of the grouping widths, until it
9549 returns 0. */
9550 GroupGenerator groupgen;
9551 GroupGenerator_init(&groupgen, grouping);
9552 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9553
9554 /* if digits are not grouped, thousands separator
9555 should be an empty string */
9556 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9557
9558 digits_pos = d_pos + n_digits;
9559 if (writer) {
9560 buffer_pos = writer->pos + n_buffer;
9561 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9562 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 }
Victor Stinner59423e32018-11-26 13:40:01 +01009564 else {
9565 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009566 }
Victor Stinner59423e32018-11-26 13:40:01 +01009567
9568 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009569 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009570 }
Victor Stinner59423e32018-11-26 13:40:01 +01009571
9572 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9573 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9574 n_zeros = Py_MAX(0, len - remaining);
9575 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9576
9577 /* Use n_zero zero's and n_chars chars */
9578
9579 /* Count only, don't do anything. */
9580 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9581
9582 /* Copy into the writer. */
9583 InsertThousandsGrouping_fill(writer, &buffer_pos,
9584 digits, &digits_pos,
9585 n_chars, n_zeros,
9586 use_separator ? thousands_sep : NULL,
9587 thousands_sep_len, maxchar);
9588
9589 /* Use a separator next time. */
9590 use_separator = 1;
9591
9592 remaining -= n_chars;
9593 min_width -= len;
9594
9595 if (remaining <= 0 && min_width <= 0) {
9596 loop_broken = 1;
9597 break;
9598 }
9599 min_width -= thousands_sep_len;
9600 }
9601 if (!loop_broken) {
9602 /* We left the loop without using a break statement. */
9603
9604 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9605 n_zeros = Py_MAX(0, len - remaining);
9606 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9607
9608 /* Use n_zero zero's and n_chars chars */
9609 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9610
9611 /* Copy into the writer. */
9612 InsertThousandsGrouping_fill(writer, &buffer_pos,
9613 digits, &digits_pos,
9614 n_chars, n_zeros,
9615 use_separator ? thousands_sep : NULL,
9616 thousands_sep_len, maxchar);
9617 }
9618 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619}
9620
9621
Alexander Belopolsky40018472011-02-26 01:02:56 +00009622Py_ssize_t
9623PyUnicode_Count(PyObject *str,
9624 PyObject *substr,
9625 Py_ssize_t start,
9626 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009628 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009629 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009630 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009632
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009633 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009634 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009635
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009636 kind1 = PyUnicode_KIND(str);
9637 kind2 = PyUnicode_KIND(substr);
9638 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009639 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009640
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009641 len1 = PyUnicode_GET_LENGTH(str);
9642 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009643 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009644 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009645 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009646
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009647 buf1 = PyUnicode_DATA(str);
9648 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009649 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009650 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009651 if (!buf2)
9652 goto onError;
9653 }
9654
9655 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009657 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009658 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009659 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009660 buf2, len2, PY_SSIZE_T_MAX
9661 );
9662 else
9663 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009664 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009665 buf2, len2, PY_SSIZE_T_MAX
9666 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 break;
9668 case PyUnicode_2BYTE_KIND:
9669 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009670 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 buf2, len2, PY_SSIZE_T_MAX
9672 );
9673 break;
9674 case PyUnicode_4BYTE_KIND:
9675 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009676 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 buf2, len2, PY_SSIZE_T_MAX
9678 );
9679 break;
9680 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009681 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009683
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009684 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009685 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009686 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009690 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9691 if (kind2 != kind1)
9692 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694}
9695
Alexander Belopolsky40018472011-02-26 01:02:56 +00009696Py_ssize_t
9697PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009698 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009699 Py_ssize_t start,
9700 Py_ssize_t end,
9701 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009703 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009704 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009705
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009706 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009707}
9708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709Py_ssize_t
9710PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9711 Py_ssize_t start, Py_ssize_t end,
9712 int direction)
9713{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009715 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 if (PyUnicode_READY(str) == -1)
9717 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009718 len = PyUnicode_GET_LENGTH(str);
9719 ADJUST_INDICES(start, end, len);
9720 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009721 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009723 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9724 kind, end-start, ch, direction);
9725 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009727 else
9728 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729}
9730
Alexander Belopolsky40018472011-02-26 01:02:56 +00009731static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009732tailmatch(PyObject *self,
9733 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009734 Py_ssize_t start,
9735 Py_ssize_t end,
9736 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009738 int kind_self;
9739 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009740 const void *data_self;
9741 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 Py_ssize_t offset;
9743 Py_ssize_t i;
9744 Py_ssize_t end_sub;
9745
9746 if (PyUnicode_READY(self) == -1 ||
9747 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009748 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9751 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009752 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009753 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009755 if (PyUnicode_GET_LENGTH(substring) == 0)
9756 return 1;
9757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 kind_self = PyUnicode_KIND(self);
9759 data_self = PyUnicode_DATA(self);
9760 kind_sub = PyUnicode_KIND(substring);
9761 data_sub = PyUnicode_DATA(substring);
9762 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9763
9764 if (direction > 0)
9765 offset = end;
9766 else
9767 offset = start;
9768
9769 if (PyUnicode_READ(kind_self, data_self, offset) ==
9770 PyUnicode_READ(kind_sub, data_sub, 0) &&
9771 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9772 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9773 /* If both are of the same kind, memcmp is sufficient */
9774 if (kind_self == kind_sub) {
9775 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009776 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 data_sub,
9778 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009779 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009781 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 else {
9783 /* We do not need to compare 0 and len(substring)-1 because
9784 the if statement above ensured already that they are equal
9785 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786 for (i = 1; i < end_sub; ++i) {
9787 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9788 PyUnicode_READ(kind_sub, data_sub, i))
9789 return 0;
9790 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009791 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793 }
9794
9795 return 0;
9796}
9797
Alexander Belopolsky40018472011-02-26 01:02:56 +00009798Py_ssize_t
9799PyUnicode_Tailmatch(PyObject *str,
9800 PyObject *substr,
9801 Py_ssize_t start,
9802 Py_ssize_t end,
9803 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009805 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009806 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009807
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009808 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809}
9810
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009811static PyObject *
9812ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009814 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009815 const char *data = PyUnicode_DATA(self);
9816 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009817 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009818
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009819 res = PyUnicode_New(len, 127);
9820 if (res == NULL)
9821 return NULL;
9822 resdata = PyUnicode_DATA(res);
9823 if (lower)
9824 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009826 _Py_bytes_upper(resdata, data, len);
9827 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828}
9829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009831handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009833 Py_ssize_t j;
9834 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009835 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009836 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009837
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009838 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9839
9840 where ! is a negation and \p{xxx} is a character with property xxx.
9841 */
9842 for (j = i - 1; j >= 0; j--) {
9843 c = PyUnicode_READ(kind, data, j);
9844 if (!_PyUnicode_IsCaseIgnorable(c))
9845 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009847 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9848 if (final_sigma) {
9849 for (j = i + 1; j < length; j++) {
9850 c = PyUnicode_READ(kind, data, j);
9851 if (!_PyUnicode_IsCaseIgnorable(c))
9852 break;
9853 }
9854 final_sigma = j == length || !_PyUnicode_IsCased(c);
9855 }
9856 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009857}
9858
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009859static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009860lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009861 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009862{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009863 /* Obscure special case. */
9864 if (c == 0x3A3) {
9865 mapped[0] = handle_capital_sigma(kind, data, length, i);
9866 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009868 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869}
9870
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009871static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009872do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009874 Py_ssize_t i, k = 0;
9875 int n_res, j;
9876 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009877
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009878 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009879 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009880 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009881 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009882 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009884 for (i = 1; i < length; i++) {
9885 c = PyUnicode_READ(kind, data, i);
9886 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9887 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009888 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009889 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009890 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009891 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009892 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893}
9894
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009895static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009896do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009897 Py_ssize_t i, k = 0;
9898
9899 for (i = 0; i < length; i++) {
9900 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9901 int n_res, j;
9902 if (Py_UNICODE_ISUPPER(c)) {
9903 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9904 }
9905 else if (Py_UNICODE_ISLOWER(c)) {
9906 n_res = _PyUnicode_ToUpperFull(c, mapped);
9907 }
9908 else {
9909 n_res = 1;
9910 mapped[0] = c;
9911 }
9912 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009913 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009914 res[k++] = mapped[j];
9915 }
9916 }
9917 return k;
9918}
9919
9920static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009921do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009922 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009924 Py_ssize_t i, k = 0;
9925
9926 for (i = 0; i < length; i++) {
9927 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9928 int n_res, j;
9929 if (lower)
9930 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9931 else
9932 n_res = _PyUnicode_ToUpperFull(c, mapped);
9933 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009934 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009935 res[k++] = mapped[j];
9936 }
9937 }
9938 return k;
9939}
9940
9941static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009942do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009943{
9944 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9945}
9946
9947static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009948do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009949{
9950 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9951}
9952
Benjamin Petersone51757f2012-01-12 21:10:29 -05009953static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009954do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -05009955{
9956 Py_ssize_t i, k = 0;
9957
9958 for (i = 0; i < length; i++) {
9959 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9960 Py_UCS4 mapped[3];
9961 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9962 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009963 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009964 res[k++] = mapped[j];
9965 }
9966 }
9967 return k;
9968}
9969
9970static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009971do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -05009972{
9973 Py_ssize_t i, k = 0;
9974 int previous_is_cased;
9975
9976 previous_is_cased = 0;
9977 for (i = 0; i < length; i++) {
9978 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9979 Py_UCS4 mapped[3];
9980 int n_res, j;
9981
9982 if (previous_is_cased)
9983 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9984 else
9985 n_res = _PyUnicode_ToTitleFull(c, mapped);
9986
9987 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009988 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009989 res[k++] = mapped[j];
9990 }
9991
9992 previous_is_cased = _PyUnicode_IsCased(c);
9993 }
9994 return k;
9995}
9996
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009997static PyObject *
9998case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009999 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010000{
10001 PyObject *res = NULL;
10002 Py_ssize_t length, newlength = 0;
10003 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010004 const void *data;
10005 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010006 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10007
Benjamin Petersoneea48462012-01-16 14:28:50 -050010008 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010009
10010 kind = PyUnicode_KIND(self);
10011 data = PyUnicode_DATA(self);
10012 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010013 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010014 PyErr_SetString(PyExc_OverflowError, "string is too long");
10015 return NULL;
10016 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010017 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010018 if (tmp == NULL)
10019 return PyErr_NoMemory();
10020 newlength = perform(kind, data, length, tmp, &maxchar);
10021 res = PyUnicode_New(newlength, maxchar);
10022 if (res == NULL)
10023 goto leave;
10024 tmpend = tmp + newlength;
10025 outdata = PyUnicode_DATA(res);
10026 outkind = PyUnicode_KIND(res);
10027 switch (outkind) {
10028 case PyUnicode_1BYTE_KIND:
10029 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10030 break;
10031 case PyUnicode_2BYTE_KIND:
10032 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10033 break;
10034 case PyUnicode_4BYTE_KIND:
10035 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10036 break;
10037 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010038 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010039 }
10040 leave:
10041 PyMem_FREE(tmp);
10042 return res;
10043}
10044
Tim Peters8ce9f162004-08-27 01:49:32 +000010045PyObject *
10046PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010047{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010048 PyObject *res;
10049 PyObject *fseq;
10050 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010051 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010052
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010053 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010054 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010055 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010056 }
10057
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010058 /* NOTE: the following code can't call back into Python code,
10059 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010060 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010061
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010062 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010063 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010064 res = _PyUnicode_JoinArray(separator, items, seqlen);
10065 Py_DECREF(fseq);
10066 return res;
10067}
10068
10069PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010070_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010071{
10072 PyObject *res = NULL; /* the result */
10073 PyObject *sep = NULL;
10074 Py_ssize_t seplen;
10075 PyObject *item;
10076 Py_ssize_t sz, i, res_offset;
10077 Py_UCS4 maxchar;
10078 Py_UCS4 item_maxchar;
10079 int use_memcpy;
10080 unsigned char *res_data = NULL, *sep_data = NULL;
10081 PyObject *last_obj;
10082 unsigned int kind = 0;
10083
Tim Peters05eba1f2004-08-27 21:32:02 +000010084 /* If empty sequence, return u"". */
10085 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010086 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010087 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010088
Tim Peters05eba1f2004-08-27 21:32:02 +000010089 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010090 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010091 if (seqlen == 1) {
10092 if (PyUnicode_CheckExact(items[0])) {
10093 res = items[0];
10094 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010095 return res;
10096 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010097 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010098 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010099 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010100 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010101 /* Set up sep and seplen */
10102 if (separator == NULL) {
10103 /* fall back to a blank space separator */
10104 sep = PyUnicode_FromOrdinal(' ');
10105 if (!sep)
10106 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010107 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010108 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010109 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010110 else {
10111 if (!PyUnicode_Check(separator)) {
10112 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010113 "separator: expected str instance,"
10114 " %.80s found",
10115 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010116 goto onError;
10117 }
10118 if (PyUnicode_READY(separator))
10119 goto onError;
10120 sep = separator;
10121 seplen = PyUnicode_GET_LENGTH(separator);
10122 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10123 /* inc refcount to keep this code path symmetric with the
10124 above case of a blank separator */
10125 Py_INCREF(sep);
10126 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010127 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010128 }
10129
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010130 /* There are at least two things to join, or else we have a subclass
10131 * of str in the sequence.
10132 * Do a pre-pass to figure out the total amount of space we'll
10133 * need (sz), and see whether all argument are strings.
10134 */
10135 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010136#ifdef Py_DEBUG
10137 use_memcpy = 0;
10138#else
10139 use_memcpy = 1;
10140#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010141 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010142 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010143 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010144 if (!PyUnicode_Check(item)) {
10145 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010146 "sequence item %zd: expected str instance,"
10147 " %.80s found",
10148 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010149 goto onError;
10150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 if (PyUnicode_READY(item) == -1)
10152 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010153 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010155 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010156 if (i != 0) {
10157 add_sz += seplen;
10158 }
10159 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010160 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010161 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010162 goto onError;
10163 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010164 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010165 if (use_memcpy && last_obj != NULL) {
10166 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10167 use_memcpy = 0;
10168 }
10169 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010170 }
Tim Petersced69f82003-09-16 20:30:58 +000010171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010173 if (res == NULL)
10174 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010175
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010176 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010177#ifdef Py_DEBUG
10178 use_memcpy = 0;
10179#else
10180 if (use_memcpy) {
10181 res_data = PyUnicode_1BYTE_DATA(res);
10182 kind = PyUnicode_KIND(res);
10183 if (seplen != 0)
10184 sep_data = PyUnicode_1BYTE_DATA(sep);
10185 }
10186#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010187 if (use_memcpy) {
10188 for (i = 0; i < seqlen; ++i) {
10189 Py_ssize_t itemlen;
10190 item = items[i];
10191
10192 /* Copy item, and maybe the separator. */
10193 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010194 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010195 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010196 kind * seplen);
10197 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010198 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010199
10200 itemlen = PyUnicode_GET_LENGTH(item);
10201 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010202 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010203 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010204 kind * itemlen);
10205 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010206 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010207 }
10208 assert(res_data == PyUnicode_1BYTE_DATA(res)
10209 + kind * PyUnicode_GET_LENGTH(res));
10210 }
10211 else {
10212 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10213 Py_ssize_t itemlen;
10214 item = items[i];
10215
10216 /* Copy item, and maybe the separator. */
10217 if (i && seplen != 0) {
10218 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10219 res_offset += seplen;
10220 }
10221
10222 itemlen = PyUnicode_GET_LENGTH(item);
10223 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010224 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010225 res_offset += itemlen;
10226 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010227 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010228 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010229 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010232 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234
Benjamin Peterson29060642009-01-31 22:14:21 +000010235 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010237 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238 return NULL;
10239}
10240
Victor Stinnerd3f08822012-05-29 12:57:52 +020010241void
10242_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10243 Py_UCS4 fill_char)
10244{
10245 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010246 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010247 assert(PyUnicode_IS_READY(unicode));
10248 assert(unicode_modifiable(unicode));
10249 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10250 assert(start >= 0);
10251 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010252 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010253}
10254
Victor Stinner3fe55312012-01-04 00:33:50 +010010255Py_ssize_t
10256PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10257 Py_UCS4 fill_char)
10258{
10259 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010260
10261 if (!PyUnicode_Check(unicode)) {
10262 PyErr_BadInternalCall();
10263 return -1;
10264 }
10265 if (PyUnicode_READY(unicode) == -1)
10266 return -1;
10267 if (unicode_check_modifiable(unicode))
10268 return -1;
10269
Victor Stinnerd3f08822012-05-29 12:57:52 +020010270 if (start < 0) {
10271 PyErr_SetString(PyExc_IndexError, "string index out of range");
10272 return -1;
10273 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010274 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10275 PyErr_SetString(PyExc_ValueError,
10276 "fill character is bigger than "
10277 "the string maximum character");
10278 return -1;
10279 }
10280
10281 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10282 length = Py_MIN(maxlen, length);
10283 if (length <= 0)
10284 return 0;
10285
Victor Stinnerd3f08822012-05-29 12:57:52 +020010286 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010287 return length;
10288}
10289
Victor Stinner9310abb2011-10-05 00:59:23 +020010290static PyObject *
10291pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010292 Py_ssize_t left,
10293 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 PyObject *u;
10297 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010298 int kind;
10299 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300
10301 if (left < 0)
10302 left = 0;
10303 if (right < 0)
10304 right = 0;
10305
Victor Stinnerc4b49542011-12-11 22:44:26 +010010306 if (left == 0 && right == 0)
10307 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10310 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010311 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10312 return NULL;
10313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010315 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010317 if (!u)
10318 return NULL;
10319
10320 kind = PyUnicode_KIND(u);
10321 data = PyUnicode_DATA(u);
10322 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010323 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010324 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010325 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010326 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010327 assert(_PyUnicode_CheckConsistency(u, 1));
10328 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329}
10330
Alexander Belopolsky40018472011-02-26 01:02:56 +000010331PyObject *
10332PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010336 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010338
Benjamin Petersonead6b532011-12-20 17:23:42 -060010339 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010341 if (PyUnicode_IS_ASCII(string))
10342 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010343 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010344 PyUnicode_GET_LENGTH(string), keepends);
10345 else
10346 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010347 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010348 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 break;
10350 case PyUnicode_2BYTE_KIND:
10351 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010352 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 PyUnicode_GET_LENGTH(string), keepends);
10354 break;
10355 case PyUnicode_4BYTE_KIND:
10356 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010357 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 PyUnicode_GET_LENGTH(string), keepends);
10359 break;
10360 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010361 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364}
10365
Alexander Belopolsky40018472011-02-26 01:02:56 +000010366static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010367split(PyObject *self,
10368 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010369 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010371 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010372 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 Py_ssize_t len1, len2;
10374 PyObject* out;
10375
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010377 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 if (PyUnicode_READY(self) == -1)
10380 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010383 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010385 if (PyUnicode_IS_ASCII(self))
10386 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010387 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010388 PyUnicode_GET_LENGTH(self), maxcount
10389 );
10390 else
10391 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010392 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010393 PyUnicode_GET_LENGTH(self), maxcount
10394 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 case PyUnicode_2BYTE_KIND:
10396 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010397 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 PyUnicode_GET_LENGTH(self), maxcount
10399 );
10400 case PyUnicode_4BYTE_KIND:
10401 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010402 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 PyUnicode_GET_LENGTH(self), maxcount
10404 );
10405 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010406 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 }
10408
10409 if (PyUnicode_READY(substring) == -1)
10410 return NULL;
10411
10412 kind1 = PyUnicode_KIND(self);
10413 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 len1 = PyUnicode_GET_LENGTH(self);
10415 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010416 if (kind1 < kind2 || len1 < len2) {
10417 out = PyList_New(1);
10418 if (out == NULL)
10419 return NULL;
10420 Py_INCREF(self);
10421 PyList_SET_ITEM(out, 0, self);
10422 return out;
10423 }
10424 buf1 = PyUnicode_DATA(self);
10425 buf2 = PyUnicode_DATA(substring);
10426 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010427 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010428 if (!buf2)
10429 return NULL;
10430 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010432 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010434 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10435 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010436 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010437 else
10438 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010439 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 break;
10441 case PyUnicode_2BYTE_KIND:
10442 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010443 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 break;
10445 case PyUnicode_4BYTE_KIND:
10446 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010447 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 break;
10449 default:
10450 out = NULL;
10451 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010452 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010453 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010454 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456}
10457
Alexander Belopolsky40018472011-02-26 01:02:56 +000010458static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010459rsplit(PyObject *self,
10460 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010461 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010462{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010463 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010464 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 Py_ssize_t len1, len2;
10466 PyObject* out;
10467
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010468 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010469 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 if (PyUnicode_READY(self) == -1)
10472 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010475 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010477 if (PyUnicode_IS_ASCII(self))
10478 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010479 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010480 PyUnicode_GET_LENGTH(self), maxcount
10481 );
10482 else
10483 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010484 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010485 PyUnicode_GET_LENGTH(self), maxcount
10486 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 case PyUnicode_2BYTE_KIND:
10488 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010489 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 PyUnicode_GET_LENGTH(self), maxcount
10491 );
10492 case PyUnicode_4BYTE_KIND:
10493 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010494 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 PyUnicode_GET_LENGTH(self), maxcount
10496 );
10497 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010498 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 }
10500
10501 if (PyUnicode_READY(substring) == -1)
10502 return NULL;
10503
10504 kind1 = PyUnicode_KIND(self);
10505 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 len1 = PyUnicode_GET_LENGTH(self);
10507 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010508 if (kind1 < kind2 || len1 < len2) {
10509 out = PyList_New(1);
10510 if (out == NULL)
10511 return NULL;
10512 Py_INCREF(self);
10513 PyList_SET_ITEM(out, 0, self);
10514 return out;
10515 }
10516 buf1 = PyUnicode_DATA(self);
10517 buf2 = PyUnicode_DATA(substring);
10518 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010519 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010520 if (!buf2)
10521 return NULL;
10522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010524 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010526 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10527 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010528 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010529 else
10530 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010531 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 break;
10533 case PyUnicode_2BYTE_KIND:
10534 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010535 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 break;
10537 case PyUnicode_4BYTE_KIND:
10538 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010539 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 break;
10541 default:
10542 out = NULL;
10543 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010544 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010545 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010546 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 return out;
10548}
10549
10550static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010551anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10552 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010554 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010556 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10557 return asciilib_find(buf1, len1, buf2, len2, offset);
10558 else
10559 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 case PyUnicode_2BYTE_KIND:
10561 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10562 case PyUnicode_4BYTE_KIND:
10563 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10564 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010565 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566}
10567
10568static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010569anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10570 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010572 switch (kind) {
10573 case PyUnicode_1BYTE_KIND:
10574 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10575 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10576 else
10577 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10578 case PyUnicode_2BYTE_KIND:
10579 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10580 case PyUnicode_4BYTE_KIND:
10581 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10582 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010583 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010584}
10585
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010586static void
10587replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10588 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10589{
10590 int kind = PyUnicode_KIND(u);
10591 void *data = PyUnicode_DATA(u);
10592 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10593 if (kind == PyUnicode_1BYTE_KIND) {
10594 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10595 (Py_UCS1 *)data + len,
10596 u1, u2, maxcount);
10597 }
10598 else if (kind == PyUnicode_2BYTE_KIND) {
10599 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10600 (Py_UCS2 *)data + len,
10601 u1, u2, maxcount);
10602 }
10603 else {
10604 assert(kind == PyUnicode_4BYTE_KIND);
10605 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10606 (Py_UCS4 *)data + len,
10607 u1, u2, maxcount);
10608 }
10609}
10610
Alexander Belopolsky40018472011-02-26 01:02:56 +000010611static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612replace(PyObject *self, PyObject *str1,
10613 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010616 const char *sbuf = PyUnicode_DATA(self);
10617 const void *buf1 = PyUnicode_DATA(str1);
10618 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 int srelease = 0, release1 = 0, release2 = 0;
10620 int skind = PyUnicode_KIND(self);
10621 int kind1 = PyUnicode_KIND(str1);
10622 int kind2 = PyUnicode_KIND(str2);
10623 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10624 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10625 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010626 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010627 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010629 if (slen < len1)
10630 goto nothing;
10631
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010633 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010634 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010635 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636
Victor Stinner59de0ee2011-10-07 10:01:28 +020010637 if (str1 == str2)
10638 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639
Victor Stinner49a0a212011-10-12 23:46:10 +020010640 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010641 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10642 if (maxchar < maxchar_str1)
10643 /* substring too wide to be present */
10644 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010645 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10646 /* Replacing str1 with str2 may cause a maxchar reduction in the
10647 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010648 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010649 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010654 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010657 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010658 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010659
Victor Stinner69ed0f42013-04-09 21:48:24 +020010660 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010661 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010662 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010664 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010666 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010668
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010669 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10670 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010671 }
10672 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 int rkind = skind;
10674 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010675 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 if (kind1 < rkind) {
10678 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010679 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (!buf1) goto error;
10681 release1 = 1;
10682 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010683 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684 if (i < 0)
10685 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 if (rkind > kind2) {
10687 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010688 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 if (!buf2) goto error;
10690 release2 = 1;
10691 }
10692 else if (rkind < kind2) {
10693 /* widen self and buf1 */
10694 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010695 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010696 assert(buf1 != PyUnicode_DATA(str1));
10697 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010698 buf1 = PyUnicode_DATA(str1);
10699 release1 = 0;
10700 }
10701 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 if (!sbuf) goto error;
10703 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010704 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 if (!buf1) goto error;
10706 release1 = 1;
10707 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010708 u = PyUnicode_New(slen, maxchar);
10709 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010711 assert(PyUnicode_KIND(u) == rkind);
10712 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010713
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010714 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010715 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010716 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010718 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010720
10721 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010722 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010723 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010724 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010725 if (i == -1)
10726 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010727 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010729 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010733 }
10734 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010736 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 int rkind = skind;
10738 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010741 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010742 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 if (!buf1) goto error;
10744 release1 = 1;
10745 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010746 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010747 if (n == 0)
10748 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010750 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010751 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 if (!buf2) goto error;
10753 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010756 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010758 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 if (!sbuf) goto error;
10760 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010761 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010762 assert(buf1 != PyUnicode_DATA(str1));
10763 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010764 buf1 = PyUnicode_DATA(str1);
10765 release1 = 0;
10766 }
10767 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 if (!buf1) goto error;
10769 release1 = 1;
10770 }
10771 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10772 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010773 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 PyErr_SetString(PyExc_OverflowError,
10775 "replace string is too long");
10776 goto error;
10777 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010778 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010779 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010780 _Py_INCREF_UNICODE_EMPTY();
10781 if (!unicode_empty)
10782 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010783 u = unicode_empty;
10784 goto done;
10785 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010786 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 PyErr_SetString(PyExc_OverflowError,
10788 "replace string is too long");
10789 goto error;
10790 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010791 u = PyUnicode_New(new_size, maxchar);
10792 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010794 assert(PyUnicode_KIND(u) == rkind);
10795 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 ires = i = 0;
10797 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010798 while (n-- > 0) {
10799 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010800 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010801 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010802 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010803 if (j == -1)
10804 break;
10805 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010806 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010807 memcpy(res + rkind * ires,
10808 sbuf + rkind * i,
10809 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010811 }
10812 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010814 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010816 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010822 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010823 memcpy(res + rkind * ires,
10824 sbuf + rkind * i,
10825 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010826 }
10827 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010828 /* interleave */
10829 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010830 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010832 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010834 if (--n <= 0)
10835 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010836 memcpy(res + rkind * ires,
10837 sbuf + rkind * i,
10838 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 ires++;
10840 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010841 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010842 memcpy(res + rkind * ires,
10843 sbuf + rkind * i,
10844 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010845 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010846 }
10847
10848 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010849 unicode_adjust_maxchar(&u);
10850 if (u == NULL)
10851 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010853
10854 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010855 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10856 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10857 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010859 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010861 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010863 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010864 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010866
Benjamin Peterson29060642009-01-31 22:14:21 +000010867 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010868 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010869 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10870 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10871 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010873 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010875 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010876 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010877 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010878 return unicode_result_unchanged(self);
10879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010881 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10882 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10883 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10884 if (srelease)
10885 PyMem_FREE((void *)sbuf);
10886 if (release1)
10887 PyMem_FREE((void *)buf1);
10888 if (release2)
10889 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891}
10892
10893/* --- Unicode Object Methods --------------------------------------------- */
10894
INADA Naoki3ae20562017-01-16 20:41:20 +090010895/*[clinic input]
10896str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897
INADA Naoki3ae20562017-01-16 20:41:20 +090010898Return a version of the string where each word is titlecased.
10899
10900More specifically, words start with uppercased characters and all remaining
10901cased characters have lower case.
10902[clinic start generated code]*/
10903
10904static PyObject *
10905unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010906/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010908 if (PyUnicode_READY(self) == -1)
10909 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010910 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911}
10912
INADA Naoki3ae20562017-01-16 20:41:20 +090010913/*[clinic input]
10914str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915
INADA Naoki3ae20562017-01-16 20:41:20 +090010916Return a capitalized version of the string.
10917
10918More specifically, make the first character have upper case and the rest lower
10919case.
10920[clinic start generated code]*/
10921
10922static PyObject *
10923unicode_capitalize_impl(PyObject *self)
10924/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010926 if (PyUnicode_READY(self) == -1)
10927 return NULL;
10928 if (PyUnicode_GET_LENGTH(self) == 0)
10929 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010930 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931}
10932
INADA Naoki3ae20562017-01-16 20:41:20 +090010933/*[clinic input]
10934str.casefold as unicode_casefold
10935
10936Return a version of the string suitable for caseless comparisons.
10937[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010938
10939static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010940unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010941/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010942{
10943 if (PyUnicode_READY(self) == -1)
10944 return NULL;
10945 if (PyUnicode_IS_ASCII(self))
10946 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010947 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010948}
10949
10950
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010951/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010952
10953static int
10954convert_uc(PyObject *obj, void *addr)
10955{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010957
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010958 if (!PyUnicode_Check(obj)) {
10959 PyErr_Format(PyExc_TypeError,
10960 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010961 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010962 return 0;
10963 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010964 if (PyUnicode_READY(obj) < 0)
10965 return 0;
10966 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010967 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010968 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010969 return 0;
10970 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010971 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010972 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010973}
10974
INADA Naoki3ae20562017-01-16 20:41:20 +090010975/*[clinic input]
10976str.center as unicode_center
10977
10978 width: Py_ssize_t
10979 fillchar: Py_UCS4 = ' '
10980 /
10981
10982Return a centered string of length width.
10983
10984Padding is done using the specified fill character (default is a space).
10985[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986
10987static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010988unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10989/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010991 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992
Benjamin Petersonbac79492012-01-14 13:34:47 -050010993 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994 return NULL;
10995
Victor Stinnerc4b49542011-12-11 22:44:26 +010010996 if (PyUnicode_GET_LENGTH(self) >= width)
10997 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998
Victor Stinnerc4b49542011-12-11 22:44:26 +010010999 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000 left = marg / 2 + (marg & width & 1);
11001
Victor Stinner9310abb2011-10-05 00:59:23 +020011002 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003}
11004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005/* This function assumes that str1 and str2 are readied by the caller. */
11006
Marc-André Lemburge5034372000-08-08 08:04:29 +000011007static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011008unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011009{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011010#define COMPARE(TYPE1, TYPE2) \
11011 do { \
11012 TYPE1* p1 = (TYPE1 *)data1; \
11013 TYPE2* p2 = (TYPE2 *)data2; \
11014 TYPE1* end = p1 + len; \
11015 Py_UCS4 c1, c2; \
11016 for (; p1 != end; p1++, p2++) { \
11017 c1 = *p1; \
11018 c2 = *p2; \
11019 if (c1 != c2) \
11020 return (c1 < c2) ? -1 : 1; \
11021 } \
11022 } \
11023 while (0)
11024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011026 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011027 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 kind1 = PyUnicode_KIND(str1);
11030 kind2 = PyUnicode_KIND(str2);
11031 data1 = PyUnicode_DATA(str1);
11032 data2 = PyUnicode_DATA(str2);
11033 len1 = PyUnicode_GET_LENGTH(str1);
11034 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011035 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011036
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011037 switch(kind1) {
11038 case PyUnicode_1BYTE_KIND:
11039 {
11040 switch(kind2) {
11041 case PyUnicode_1BYTE_KIND:
11042 {
11043 int cmp = memcmp(data1, data2, len);
11044 /* normalize result of memcmp() into the range [-1; 1] */
11045 if (cmp < 0)
11046 return -1;
11047 if (cmp > 0)
11048 return 1;
11049 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011050 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011051 case PyUnicode_2BYTE_KIND:
11052 COMPARE(Py_UCS1, Py_UCS2);
11053 break;
11054 case PyUnicode_4BYTE_KIND:
11055 COMPARE(Py_UCS1, Py_UCS4);
11056 break;
11057 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011058 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011059 }
11060 break;
11061 }
11062 case PyUnicode_2BYTE_KIND:
11063 {
11064 switch(kind2) {
11065 case PyUnicode_1BYTE_KIND:
11066 COMPARE(Py_UCS2, Py_UCS1);
11067 break;
11068 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011069 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011070 COMPARE(Py_UCS2, Py_UCS2);
11071 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011072 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011073 case PyUnicode_4BYTE_KIND:
11074 COMPARE(Py_UCS2, Py_UCS4);
11075 break;
11076 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011077 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011078 }
11079 break;
11080 }
11081 case PyUnicode_4BYTE_KIND:
11082 {
11083 switch(kind2) {
11084 case PyUnicode_1BYTE_KIND:
11085 COMPARE(Py_UCS4, Py_UCS1);
11086 break;
11087 case PyUnicode_2BYTE_KIND:
11088 COMPARE(Py_UCS4, Py_UCS2);
11089 break;
11090 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011091 {
11092#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11093 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11094 /* normalize result of wmemcmp() into the range [-1; 1] */
11095 if (cmp < 0)
11096 return -1;
11097 if (cmp > 0)
11098 return 1;
11099#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011100 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011101#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011102 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011103 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011104 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011105 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011106 }
11107 break;
11108 }
11109 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011110 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011111 }
11112
Victor Stinner770e19e2012-10-04 22:59:45 +020011113 if (len1 == len2)
11114 return 0;
11115 if (len1 < len2)
11116 return -1;
11117 else
11118 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011119
11120#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011121}
11122
Benjamin Peterson621b4302016-09-09 13:54:34 -070011123static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011124unicode_compare_eq(PyObject *str1, PyObject *str2)
11125{
11126 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011127 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011128 Py_ssize_t len;
11129 int cmp;
11130
Victor Stinnere5567ad2012-10-23 02:48:49 +020011131 len = PyUnicode_GET_LENGTH(str1);
11132 if (PyUnicode_GET_LENGTH(str2) != len)
11133 return 0;
11134 kind = PyUnicode_KIND(str1);
11135 if (PyUnicode_KIND(str2) != kind)
11136 return 0;
11137 data1 = PyUnicode_DATA(str1);
11138 data2 = PyUnicode_DATA(str2);
11139
11140 cmp = memcmp(data1, data2, len * kind);
11141 return (cmp == 0);
11142}
11143
11144
Alexander Belopolsky40018472011-02-26 01:02:56 +000011145int
11146PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11149 if (PyUnicode_READY(left) == -1 ||
11150 PyUnicode_READY(right) == -1)
11151 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011152
11153 /* a string is equal to itself */
11154 if (left == right)
11155 return 0;
11156
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011157 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011159 PyErr_Format(PyExc_TypeError,
11160 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011161 Py_TYPE(left)->tp_name,
11162 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163 return -1;
11164}
11165
Martin v. Löwis5b222132007-06-10 09:51:05 +000011166int
11167PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11168{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 Py_ssize_t i;
11170 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011172 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173
Victor Stinner910337b2011-10-03 03:20:16 +020011174 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011175 if (!PyUnicode_IS_READY(uni)) {
11176 const wchar_t *ws = _PyUnicode_WSTR(uni);
11177 /* Compare Unicode string and source character set string */
11178 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11179 if (chr != ustr[i])
11180 return (chr < ustr[i]) ? -1 : 1;
11181 }
11182 /* This check keeps Python strings that end in '\0' from comparing equal
11183 to C strings identical up to that point. */
11184 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11185 return 1; /* uni is longer */
11186 if (ustr[i])
11187 return -1; /* str is longer */
11188 return 0;
11189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011191 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011192 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011193 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011194 size_t len, len2 = strlen(str);
11195 int cmp;
11196
11197 len = Py_MIN(len1, len2);
11198 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011199 if (cmp != 0) {
11200 if (cmp < 0)
11201 return -1;
11202 else
11203 return 1;
11204 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011205 if (len1 > len2)
11206 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011207 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011208 return -1; /* str is longer */
11209 return 0;
11210 }
11211 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011212 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011213 /* Compare Unicode string and source character set string */
11214 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011215 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011216 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11217 /* This check keeps Python strings that end in '\0' from comparing equal
11218 to C strings identical up to that point. */
11219 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11220 return 1; /* uni is longer */
11221 if (str[i])
11222 return -1; /* str is longer */
11223 return 0;
11224 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011225}
11226
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011227static int
11228non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11229{
11230 size_t i, len;
11231 const wchar_t *p;
11232 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11233 if (strlen(str) != len)
11234 return 0;
11235 p = _PyUnicode_WSTR(unicode);
11236 assert(p);
11237 for (i = 0; i < len; i++) {
11238 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011239 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011240 return 0;
11241 }
11242 return 1;
11243}
11244
11245int
11246_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11247{
11248 size_t len;
11249 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011250 assert(str);
11251#ifndef NDEBUG
11252 for (const char *p = str; *p; p++) {
11253 assert((unsigned char)*p < 128);
11254 }
11255#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011256 if (PyUnicode_READY(unicode) == -1) {
11257 /* Memory error or bad data */
11258 PyErr_Clear();
11259 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11260 }
11261 if (!PyUnicode_IS_ASCII(unicode))
11262 return 0;
11263 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11264 return strlen(str) == len &&
11265 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11266}
11267
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011268int
11269_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11270{
11271 PyObject *right_uni;
11272 Py_hash_t hash;
11273
11274 assert(_PyUnicode_CHECK(left));
11275 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011276#ifndef NDEBUG
11277 for (const char *p = right->string; *p; p++) {
11278 assert((unsigned char)*p < 128);
11279 }
11280#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011281
11282 if (PyUnicode_READY(left) == -1) {
11283 /* memory error or bad data */
11284 PyErr_Clear();
11285 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11286 }
11287
11288 if (!PyUnicode_IS_ASCII(left))
11289 return 0;
11290
11291 right_uni = _PyUnicode_FromId(right); /* borrowed */
11292 if (right_uni == NULL) {
11293 /* memory error or bad data */
11294 PyErr_Clear();
11295 return _PyUnicode_EqualToASCIIString(left, right->string);
11296 }
11297
11298 if (left == right_uni)
11299 return 1;
11300
11301 if (PyUnicode_CHECK_INTERNED(left))
11302 return 0;
11303
INADA Naoki7cc95f52018-01-28 02:07:09 +090011304 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011305 hash = _PyUnicode_HASH(left);
11306 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11307 return 0;
11308
11309 return unicode_compare_eq(left, right_uni);
11310}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011311
Alexander Belopolsky40018472011-02-26 01:02:56 +000011312PyObject *
11313PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011314{
11315 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011316
Victor Stinnere5567ad2012-10-23 02:48:49 +020011317 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11318 Py_RETURN_NOTIMPLEMENTED;
11319
11320 if (PyUnicode_READY(left) == -1 ||
11321 PyUnicode_READY(right) == -1)
11322 return NULL;
11323
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011324 if (left == right) {
11325 switch (op) {
11326 case Py_EQ:
11327 case Py_LE:
11328 case Py_GE:
11329 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011330 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011331 case Py_NE:
11332 case Py_LT:
11333 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011334 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011335 default:
11336 PyErr_BadArgument();
11337 return NULL;
11338 }
11339 }
11340 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011341 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011342 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011343 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011344 }
11345 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011346 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011347 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011348 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011349}
11350
Alexander Belopolsky40018472011-02-26 01:02:56 +000011351int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011352_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11353{
11354 return unicode_eq(aa, bb);
11355}
11356
11357int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011358PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011359{
Victor Stinner77282cb2013-04-14 19:22:47 +020011360 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011361 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011363 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011364
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011365 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011367 "'in <string>' requires string as left operand, not %.100s",
11368 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011369 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011370 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011371 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011372 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011373 if (ensure_unicode(str) < 0)
11374 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011377 kind2 = PyUnicode_KIND(substr);
11378 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011379 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011381 len2 = PyUnicode_GET_LENGTH(substr);
11382 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011383 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011384 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011385 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011386 if (len2 == 1) {
11387 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11388 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011389 return result;
11390 }
11391 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011392 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011393 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011394 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396
Victor Stinner77282cb2013-04-14 19:22:47 +020011397 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 case PyUnicode_1BYTE_KIND:
11399 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11400 break;
11401 case PyUnicode_2BYTE_KIND:
11402 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11403 break;
11404 case PyUnicode_4BYTE_KIND:
11405 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11406 break;
11407 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011408 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011410
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011411 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011412 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011413 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414
Guido van Rossum403d68b2000-03-13 15:55:09 +000011415 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011416}
11417
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418/* Concat to string or Unicode object giving a new Unicode object. */
11419
Alexander Belopolsky40018472011-02-26 01:02:56 +000011420PyObject *
11421PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011423 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011424 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011425 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011427 if (ensure_unicode(left) < 0)
11428 return NULL;
11429
11430 if (!PyUnicode_Check(right)) {
11431 PyErr_Format(PyExc_TypeError,
11432 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011433 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011434 return NULL;
11435 }
11436 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011437 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
11439 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011440 if (left == unicode_empty)
11441 return PyUnicode_FromObject(right);
11442 if (right == unicode_empty)
11443 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011445 left_len = PyUnicode_GET_LENGTH(left);
11446 right_len = PyUnicode_GET_LENGTH(right);
11447 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011448 PyErr_SetString(PyExc_OverflowError,
11449 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011450 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011451 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011452 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011453
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011454 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11455 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011456 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011459 result = PyUnicode_New(new_len, maxchar);
11460 if (result == NULL)
11461 return NULL;
11462 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11463 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11464 assert(_PyUnicode_CheckConsistency(result, 1));
11465 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466}
11467
Walter Dörwald1ab83302007-05-18 17:15:44 +000011468void
Victor Stinner23e56682011-10-03 03:54:37 +020011469PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011470{
Victor Stinner23e56682011-10-03 03:54:37 +020011471 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011472 Py_UCS4 maxchar, maxchar2;
11473 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011474
11475 if (p_left == NULL) {
11476 if (!PyErr_Occurred())
11477 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011478 return;
11479 }
Victor Stinner23e56682011-10-03 03:54:37 +020011480 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011481 if (right == NULL || left == NULL
11482 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011483 if (!PyErr_Occurred())
11484 PyErr_BadInternalCall();
11485 goto error;
11486 }
11487
Benjamin Petersonbac79492012-01-14 13:34:47 -050011488 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011489 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011490 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011491 goto error;
11492
Victor Stinner488fa492011-12-12 00:01:39 +010011493 /* Shortcuts */
11494 if (left == unicode_empty) {
11495 Py_DECREF(left);
11496 Py_INCREF(right);
11497 *p_left = right;
11498 return;
11499 }
11500 if (right == unicode_empty)
11501 return;
11502
11503 left_len = PyUnicode_GET_LENGTH(left);
11504 right_len = PyUnicode_GET_LENGTH(right);
11505 if (left_len > PY_SSIZE_T_MAX - right_len) {
11506 PyErr_SetString(PyExc_OverflowError,
11507 "strings are too large to concat");
11508 goto error;
11509 }
11510 new_len = left_len + right_len;
11511
11512 if (unicode_modifiable(left)
11513 && PyUnicode_CheckExact(right)
11514 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011515 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11516 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011517 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011518 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011519 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11520 {
11521 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011522 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011523 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011524
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011525 /* copy 'right' into the newly allocated area of 'left' */
11526 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011527 }
Victor Stinner488fa492011-12-12 00:01:39 +010011528 else {
11529 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11530 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011531 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011532
Victor Stinner488fa492011-12-12 00:01:39 +010011533 /* Concat the two Unicode strings */
11534 res = PyUnicode_New(new_len, maxchar);
11535 if (res == NULL)
11536 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011537 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11538 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011539 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011540 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011541 }
11542 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011543 return;
11544
11545error:
Victor Stinner488fa492011-12-12 00:01:39 +010011546 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011547}
11548
11549void
11550PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11551{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011552 PyUnicode_Append(pleft, right);
11553 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011554}
11555
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011556/*
11557Wraps stringlib_parse_args_finds() and additionally ensures that the
11558first argument is a unicode object.
11559*/
11560
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011561static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011562parse_args_finds_unicode(const char * function_name, PyObject *args,
11563 PyObject **substring,
11564 Py_ssize_t *start, Py_ssize_t *end)
11565{
11566 if(stringlib_parse_args_finds(function_name, args, substring,
11567 start, end)) {
11568 if (ensure_unicode(*substring) < 0)
11569 return 0;
11570 return 1;
11571 }
11572 return 0;
11573}
11574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011575PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011576 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011578Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011579string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011580interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581
11582static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011583unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011585 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011586 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011587 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011589 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011590 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011593 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 kind1 = PyUnicode_KIND(self);
11597 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011598 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011599 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 len1 = PyUnicode_GET_LENGTH(self);
11602 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011604 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011605 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011606
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011607 buf1 = PyUnicode_DATA(self);
11608 buf2 = PyUnicode_DATA(substring);
11609 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011610 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011611 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011612 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011613 }
11614 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615 case PyUnicode_1BYTE_KIND:
11616 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011617 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618 buf2, len2, PY_SSIZE_T_MAX
11619 );
11620 break;
11621 case PyUnicode_2BYTE_KIND:
11622 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011623 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 buf2, len2, PY_SSIZE_T_MAX
11625 );
11626 break;
11627 case PyUnicode_4BYTE_KIND:
11628 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011629 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 buf2, len2, PY_SSIZE_T_MAX
11631 );
11632 break;
11633 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011634 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 }
11636
11637 result = PyLong_FromSsize_t(iresult);
11638
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011639 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011640 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011641 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643 return result;
11644}
11645
INADA Naoki3ae20562017-01-16 20:41:20 +090011646/*[clinic input]
11647str.encode as unicode_encode
11648
11649 encoding: str(c_default="NULL") = 'utf-8'
11650 The encoding in which to encode the string.
11651 errors: str(c_default="NULL") = 'strict'
11652 The error handling scheme to use for encoding errors.
11653 The default is 'strict' meaning that encoding errors raise a
11654 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11655 'xmlcharrefreplace' as well as any other name registered with
11656 codecs.register_error that can handle UnicodeEncodeErrors.
11657
11658Encode the string using the codec registered for encoding.
11659[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660
11661static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011662unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011663/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011665 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011666}
11667
INADA Naoki3ae20562017-01-16 20:41:20 +090011668/*[clinic input]
11669str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670
INADA Naoki3ae20562017-01-16 20:41:20 +090011671 tabsize: int = 8
11672
11673Return a copy where all tab characters are expanded using spaces.
11674
11675If tabsize is not given, a tab size of 8 characters is assumed.
11676[clinic start generated code]*/
11677
11678static PyObject *
11679unicode_expandtabs_impl(PyObject *self, int tabsize)
11680/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011682 Py_ssize_t i, j, line_pos, src_len, incr;
11683 Py_UCS4 ch;
11684 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011685 const void *src_data;
11686 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011687 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011688 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689
Antoine Pitrou22425222011-10-04 19:10:51 +020011690 if (PyUnicode_READY(self) == -1)
11691 return NULL;
11692
Thomas Wouters7e474022000-07-16 12:04:32 +000011693 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011694 src_len = PyUnicode_GET_LENGTH(self);
11695 i = j = line_pos = 0;
11696 kind = PyUnicode_KIND(self);
11697 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011698 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011699 for (; i < src_len; i++) {
11700 ch = PyUnicode_READ(kind, src_data, i);
11701 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011702 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011703 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011704 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011705 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011706 goto overflow;
11707 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011709 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011712 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011713 goto overflow;
11714 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011716 if (ch == '\n' || ch == '\r')
11717 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011719 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011720 if (!found)
11721 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011722
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011724 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725 if (!u)
11726 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011727 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728
Antoine Pitroue71d5742011-10-04 15:55:09 +020011729 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730
Antoine Pitroue71d5742011-10-04 15:55:09 +020011731 for (; i < src_len; i++) {
11732 ch = PyUnicode_READ(kind, src_data, i);
11733 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011734 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011735 incr = tabsize - (line_pos % tabsize);
11736 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011737 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011738 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011740 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011741 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011742 line_pos++;
11743 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011744 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011745 if (ch == '\n' || ch == '\r')
11746 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011748 }
11749 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011750 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011751
Antoine Pitroue71d5742011-10-04 15:55:09 +020011752 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011753 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11754 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755}
11756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011757PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759\n\
11760Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011761such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762arguments start and end are interpreted as in slice notation.\n\
11763\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011764Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
11766static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011769 /* initialize variables to prevent gcc warning */
11770 PyObject *substring = NULL;
11771 Py_ssize_t start = 0;
11772 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011773 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011775 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011778 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011781 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 if (result == -2)
11784 return NULL;
11785
Christian Heimes217cfd12007-12-02 14:31:20 +000011786 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787}
11788
11789static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011790unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011792 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011793 enum PyUnicode_Kind kind;
11794 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011795
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011796 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011797 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011799 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011800 if (PyUnicode_READY(self) == -1) {
11801 return NULL;
11802 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011803 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11804 PyErr_SetString(PyExc_IndexError, "string index out of range");
11805 return NULL;
11806 }
11807 kind = PyUnicode_KIND(self);
11808 data = PyUnicode_DATA(self);
11809 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011810 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811}
11812
Guido van Rossumc2504932007-09-18 19:42:40 +000011813/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011814 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011815static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011816unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011818 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011819
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011820#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011821 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011822#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 if (_PyUnicode_HASH(self) != -1)
11824 return _PyUnicode_HASH(self);
11825 if (PyUnicode_READY(self) == -1)
11826 return -1;
animalizea1d14252019-01-02 20:16:06 +080011827
Christian Heimes985ecdc2013-11-20 11:46:18 +010011828 x = _Py_HashBytes(PyUnicode_DATA(self),
11829 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011831 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832}
11833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011834PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011835 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836\n\
oldkaa0735f2018-02-02 16:52:55 +080011837Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011838such that sub is contained within S[start:end]. Optional\n\
11839arguments start and end are interpreted as in slice notation.\n\
11840\n\
11841Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842
11843static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011846 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011847 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011848 PyObject *substring = NULL;
11849 Py_ssize_t start = 0;
11850 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011852 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011855 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011858 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 if (result == -2)
11861 return NULL;
11862
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863 if (result < 0) {
11864 PyErr_SetString(PyExc_ValueError, "substring not found");
11865 return NULL;
11866 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011867
Christian Heimes217cfd12007-12-02 14:31:20 +000011868 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869}
11870
INADA Naoki3ae20562017-01-16 20:41:20 +090011871/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011872str.isascii as unicode_isascii
11873
11874Return True if all characters in the string are ASCII, False otherwise.
11875
11876ASCII characters have code points in the range U+0000-U+007F.
11877Empty string is ASCII too.
11878[clinic start generated code]*/
11879
11880static PyObject *
11881unicode_isascii_impl(PyObject *self)
11882/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11883{
11884 if (PyUnicode_READY(self) == -1) {
11885 return NULL;
11886 }
11887 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11888}
11889
11890/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011891str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892
INADA Naoki3ae20562017-01-16 20:41:20 +090011893Return True if the string is a lowercase string, False otherwise.
11894
11895A string is lowercase if all cased characters in the string are lowercase and
11896there is at least one cased character in the string.
11897[clinic start generated code]*/
11898
11899static PyObject *
11900unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011901/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 Py_ssize_t i, length;
11904 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011905 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906 int cased;
11907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 if (PyUnicode_READY(self) == -1)
11909 return NULL;
11910 length = PyUnicode_GET_LENGTH(self);
11911 kind = PyUnicode_KIND(self);
11912 data = PyUnicode_DATA(self);
11913
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 if (length == 1)
11916 return PyBool_FromLong(
11917 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011919 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011921 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011922
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011924 for (i = 0; i < length; i++) {
11925 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011926
Benjamin Peterson29060642009-01-31 22:14:21 +000011927 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011928 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 else if (!cased && Py_UNICODE_ISLOWER(ch))
11930 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011932 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933}
11934
INADA Naoki3ae20562017-01-16 20:41:20 +090011935/*[clinic input]
11936str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937
INADA Naoki3ae20562017-01-16 20:41:20 +090011938Return True if the string is an uppercase string, False otherwise.
11939
11940A string is uppercase if all cased characters in the string are uppercase and
11941there is at least one cased character in the string.
11942[clinic start generated code]*/
11943
11944static PyObject *
11945unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011946/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 Py_ssize_t i, length;
11949 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011950 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951 int cased;
11952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 if (PyUnicode_READY(self) == -1)
11954 return NULL;
11955 length = PyUnicode_GET_LENGTH(self);
11956 kind = PyUnicode_KIND(self);
11957 data = PyUnicode_DATA(self);
11958
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 if (length == 1)
11961 return PyBool_FromLong(
11962 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011964 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011966 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011967
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 for (i = 0; i < length; i++) {
11970 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011971
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011973 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011974 else if (!cased && Py_UNICODE_ISUPPER(ch))
11975 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011977 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978}
11979
INADA Naoki3ae20562017-01-16 20:41:20 +090011980/*[clinic input]
11981str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982
INADA Naoki3ae20562017-01-16 20:41:20 +090011983Return True if the string is a title-cased string, False otherwise.
11984
11985In a title-cased string, upper- and title-case characters may only
11986follow uncased characters and lowercase characters only cased ones.
11987[clinic start generated code]*/
11988
11989static PyObject *
11990unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011991/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 Py_ssize_t i, length;
11994 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011995 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996 int cased, previous_is_cased;
11997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 if (PyUnicode_READY(self) == -1)
11999 return NULL;
12000 length = PyUnicode_GET_LENGTH(self);
12001 kind = PyUnicode_KIND(self);
12002 data = PyUnicode_DATA(self);
12003
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 if (length == 1) {
12006 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12007 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12008 (Py_UNICODE_ISUPPER(ch) != 0));
12009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012011 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012013 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012014
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015 cased = 0;
12016 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 for (i = 0; i < length; i++) {
12018 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012019
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12021 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012022 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012023 previous_is_cased = 1;
12024 cased = 1;
12025 }
12026 else if (Py_UNICODE_ISLOWER(ch)) {
12027 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012028 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012029 previous_is_cased = 1;
12030 cased = 1;
12031 }
12032 else
12033 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012035 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036}
12037
INADA Naoki3ae20562017-01-16 20:41:20 +090012038/*[clinic input]
12039str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040
INADA Naoki3ae20562017-01-16 20:41:20 +090012041Return True if the string is a whitespace string, False otherwise.
12042
12043A string is whitespace if all characters in the string are whitespace and there
12044is at least one character in the string.
12045[clinic start generated code]*/
12046
12047static PyObject *
12048unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012049/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 Py_ssize_t i, length;
12052 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012053 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054
12055 if (PyUnicode_READY(self) == -1)
12056 return NULL;
12057 length = PyUnicode_GET_LENGTH(self);
12058 kind = PyUnicode_KIND(self);
12059 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062 if (length == 1)
12063 return PyBool_FromLong(
12064 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012066 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012068 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 for (i = 0; i < length; i++) {
12071 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012072 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012073 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012075 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076}
12077
INADA Naoki3ae20562017-01-16 20:41:20 +090012078/*[clinic input]
12079str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012080
INADA Naoki3ae20562017-01-16 20:41:20 +090012081Return True if the string is an alphabetic string, False otherwise.
12082
12083A string is alphabetic if all characters in the string are alphabetic and there
12084is at least one character in the string.
12085[clinic start generated code]*/
12086
12087static PyObject *
12088unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012089/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012090{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 Py_ssize_t i, length;
12092 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012093 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012094
12095 if (PyUnicode_READY(self) == -1)
12096 return NULL;
12097 length = PyUnicode_GET_LENGTH(self);
12098 kind = PyUnicode_KIND(self);
12099 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012100
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012101 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 if (length == 1)
12103 return PyBool_FromLong(
12104 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012105
12106 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012108 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 for (i = 0; i < length; i++) {
12111 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012112 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012113 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012114 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012115}
12116
INADA Naoki3ae20562017-01-16 20:41:20 +090012117/*[clinic input]
12118str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012119
INADA Naoki3ae20562017-01-16 20:41:20 +090012120Return True if the string is an alpha-numeric string, False otherwise.
12121
12122A string is alpha-numeric if all characters in the string are alpha-numeric and
12123there is at least one character in the string.
12124[clinic start generated code]*/
12125
12126static PyObject *
12127unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012128/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012131 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 Py_ssize_t len, i;
12133
12134 if (PyUnicode_READY(self) == -1)
12135 return NULL;
12136
12137 kind = PyUnicode_KIND(self);
12138 data = PyUnicode_DATA(self);
12139 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012140
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012141 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 if (len == 1) {
12143 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12144 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12145 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012146
12147 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012149 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 for (i = 0; i < len; i++) {
12152 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012153 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012154 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012155 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012156 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012157}
12158
INADA Naoki3ae20562017-01-16 20:41:20 +090012159/*[clinic input]
12160str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161
INADA Naoki3ae20562017-01-16 20:41:20 +090012162Return True if the string is a decimal string, False otherwise.
12163
12164A string is a decimal string if all characters in the string are decimal and
12165there is at least one character in the string.
12166[clinic start generated code]*/
12167
12168static PyObject *
12169unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012170/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 Py_ssize_t i, length;
12173 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012174 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175
12176 if (PyUnicode_READY(self) == -1)
12177 return NULL;
12178 length = PyUnicode_GET_LENGTH(self);
12179 kind = PyUnicode_KIND(self);
12180 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 if (length == 1)
12184 return PyBool_FromLong(
12185 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012187 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012189 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 for (i = 0; i < length; i++) {
12192 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012193 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012195 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196}
12197
INADA Naoki3ae20562017-01-16 20:41:20 +090012198/*[clinic input]
12199str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200
INADA Naoki3ae20562017-01-16 20:41:20 +090012201Return True if the string is a digit string, False otherwise.
12202
12203A string is a digit string if all characters in the string are digits and there
12204is at least one character in the string.
12205[clinic start generated code]*/
12206
12207static PyObject *
12208unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012209/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 Py_ssize_t i, length;
12212 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012213 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214
12215 if (PyUnicode_READY(self) == -1)
12216 return NULL;
12217 length = PyUnicode_GET_LENGTH(self);
12218 kind = PyUnicode_KIND(self);
12219 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 if (length == 1) {
12223 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12224 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12225 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012227 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012229 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 for (i = 0; i < length; i++) {
12232 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012233 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012235 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236}
12237
INADA Naoki3ae20562017-01-16 20:41:20 +090012238/*[clinic input]
12239str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240
INADA Naoki3ae20562017-01-16 20:41:20 +090012241Return True if the string is a numeric string, False otherwise.
12242
12243A string is numeric if all characters in the string are numeric and there is at
12244least one character in the string.
12245[clinic start generated code]*/
12246
12247static PyObject *
12248unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012249/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251 Py_ssize_t i, length;
12252 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012253 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254
12255 if (PyUnicode_READY(self) == -1)
12256 return NULL;
12257 length = PyUnicode_GET_LENGTH(self);
12258 kind = PyUnicode_KIND(self);
12259 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262 if (length == 1)
12263 return PyBool_FromLong(
12264 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012266 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012268 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 for (i = 0; i < length; i++) {
12271 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012272 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012274 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275}
12276
Martin v. Löwis47383402007-08-15 07:32:56 +000012277int
12278PyUnicode_IsIdentifier(PyObject *self)
12279{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 Py_ssize_t i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012281 int ready = PyUnicode_IS_READY(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012282
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012283 Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
12284 if (len == 0) {
12285 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012287 }
12288
Hai Shi3d235f52020-02-17 21:41:15 +080012289 int kind = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012290 const void *data = NULL;
Andy Lester933fc53f2020-02-20 22:51:47 -060012291 const wchar_t *wstr = NULL;
12292 Py_UCS4 ch;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012293 if (ready) {
12294 kind = PyUnicode_KIND(self);
12295 data = PyUnicode_DATA(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012296 ch = PyUnicode_READ(kind, data, 0);
12297 }
12298 else {
Andy Lester933fc53f2020-02-20 22:51:47 -060012299 wstr = _PyUnicode_WSTR(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012300 ch = wstr[0];
12301 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012302 /* PEP 3131 says that the first character must be in
12303 XID_Start and subsequent characters in XID_Continue,
12304 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012305 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012306 letters, digits, underscore). However, given the current
12307 definition of XID_Start and XID_Continue, it is sufficient
12308 to check just for these, except that _ must be allowed
12309 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012310 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012311 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012312 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012313
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012314 for (i = 1; i < len; i++) {
12315 if (ready) {
12316 ch = PyUnicode_READ(kind, data, i);
12317 }
12318 else {
12319 ch = wstr[i];
12320 }
12321 if (!_PyUnicode_IsXidContinue(ch)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012323 }
12324 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012325 return 1;
12326}
12327
INADA Naoki3ae20562017-01-16 20:41:20 +090012328/*[clinic input]
12329str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012330
INADA Naoki3ae20562017-01-16 20:41:20 +090012331Return True if the string is a valid Python identifier, False otherwise.
12332
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012333Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012334such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012335[clinic start generated code]*/
12336
12337static PyObject *
12338unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012339/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012340{
12341 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12342}
12343
INADA Naoki3ae20562017-01-16 20:41:20 +090012344/*[clinic input]
12345str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012346
INADA Naoki3ae20562017-01-16 20:41:20 +090012347Return True if the string is printable, False otherwise.
12348
12349A string is printable if all of its characters are considered printable in
12350repr() or if it is empty.
12351[clinic start generated code]*/
12352
12353static PyObject *
12354unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012355/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012356{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 Py_ssize_t i, length;
12358 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012359 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360
12361 if (PyUnicode_READY(self) == -1)
12362 return NULL;
12363 length = PyUnicode_GET_LENGTH(self);
12364 kind = PyUnicode_KIND(self);
12365 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012366
12367 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 if (length == 1)
12369 return PyBool_FromLong(
12370 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 for (i = 0; i < length; i++) {
12373 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012374 Py_RETURN_FALSE;
12375 }
12376 }
12377 Py_RETURN_TRUE;
12378}
12379
INADA Naoki3ae20562017-01-16 20:41:20 +090012380/*[clinic input]
12381str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382
INADA Naoki3ae20562017-01-16 20:41:20 +090012383 iterable: object
12384 /
12385
12386Concatenate any number of strings.
12387
Martin Panter91a88662017-01-24 00:30:06 +000012388The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012389The result is returned as a new string.
12390
12391Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12392[clinic start generated code]*/
12393
12394static PyObject *
12395unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012396/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397{
INADA Naoki3ae20562017-01-16 20:41:20 +090012398 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012399}
12400
Martin v. Löwis18e16552006-02-15 17:27:45 +000012401static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012402unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012403{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 if (PyUnicode_READY(self) == -1)
12405 return -1;
12406 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012407}
12408
INADA Naoki3ae20562017-01-16 20:41:20 +090012409/*[clinic input]
12410str.ljust as unicode_ljust
12411
12412 width: Py_ssize_t
12413 fillchar: Py_UCS4 = ' '
12414 /
12415
12416Return a left-justified string of length width.
12417
12418Padding is done using the specified fill character (default is a space).
12419[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420
12421static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012422unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12423/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012424{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012425 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427
Victor Stinnerc4b49542011-12-11 22:44:26 +010012428 if (PyUnicode_GET_LENGTH(self) >= width)
12429 return unicode_result_unchanged(self);
12430
12431 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432}
12433
INADA Naoki3ae20562017-01-16 20:41:20 +090012434/*[clinic input]
12435str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436
INADA Naoki3ae20562017-01-16 20:41:20 +090012437Return a copy of the string converted to lowercase.
12438[clinic start generated code]*/
12439
12440static PyObject *
12441unicode_lower_impl(PyObject *self)
12442/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012443{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012444 if (PyUnicode_READY(self) == -1)
12445 return NULL;
12446 if (PyUnicode_IS_ASCII(self))
12447 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012448 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449}
12450
/* Which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip type constants above; used when
   formatting error messages.  Both the strings and the table itself are
   read-only. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip type constant to the name of the corresponding method. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012459
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012460/* externally visible for str.strip(unicode) */
12461PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012462_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012463{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012464 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 int kind;
12466 Py_ssize_t i, j, len;
12467 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012468 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12471 return NULL;
12472
12473 kind = PyUnicode_KIND(self);
12474 data = PyUnicode_DATA(self);
12475 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012476 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12478 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012479 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012480
Benjamin Peterson14339b62009-01-31 16:36:08 +000012481 i = 0;
12482 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012483 while (i < len) {
12484 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12485 if (!BLOOM(sepmask, ch))
12486 break;
12487 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12488 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012489 i++;
12490 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012491 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012492
Benjamin Peterson14339b62009-01-31 16:36:08 +000012493 j = len;
12494 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012495 j--;
12496 while (j >= i) {
12497 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12498 if (!BLOOM(sepmask, ch))
12499 break;
12500 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12501 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012502 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012503 }
12504
Benjamin Peterson29060642009-01-31 22:14:21 +000012505 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012506 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012507
Victor Stinner7931d9a2011-11-04 00:22:48 +010012508 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509}
12510
12511PyObject*
12512PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12513{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012514 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012516 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517
Victor Stinnerde636f32011-10-01 03:55:54 +020012518 if (PyUnicode_READY(self) == -1)
12519 return NULL;
12520
Victor Stinner684d5fd2012-05-03 02:32:34 +020012521 length = PyUnicode_GET_LENGTH(self);
12522 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012523
Victor Stinner684d5fd2012-05-03 02:32:34 +020012524 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012525 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526
Victor Stinnerde636f32011-10-01 03:55:54 +020012527 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012528 PyErr_SetString(PyExc_IndexError, "string index out of range");
12529 return NULL;
12530 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012531 if (start >= length || end < start)
12532 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012533
Victor Stinner684d5fd2012-05-03 02:32:34 +020012534 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012535 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012536 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012537 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012538 }
12539 else {
12540 kind = PyUnicode_KIND(self);
12541 data = PyUnicode_1BYTE_DATA(self);
12542 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012543 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012544 length);
12545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547
12548static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012549do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 Py_ssize_t len, i, j;
12552
12553 if (PyUnicode_READY(self) == -1)
12554 return NULL;
12555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012557
Victor Stinnercc7af722013-04-09 22:39:24 +020012558 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012559 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012560
12561 i = 0;
12562 if (striptype != RIGHTSTRIP) {
12563 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012564 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012565 if (!_Py_ascii_whitespace[ch])
12566 break;
12567 i++;
12568 }
12569 }
12570
12571 j = len;
12572 if (striptype != LEFTSTRIP) {
12573 j--;
12574 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012575 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012576 if (!_Py_ascii_whitespace[ch])
12577 break;
12578 j--;
12579 }
12580 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012581 }
12582 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012583 else {
12584 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012585 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012586
Victor Stinnercc7af722013-04-09 22:39:24 +020012587 i = 0;
12588 if (striptype != RIGHTSTRIP) {
12589 while (i < len) {
12590 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12591 if (!Py_UNICODE_ISSPACE(ch))
12592 break;
12593 i++;
12594 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012595 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012596
12597 j = len;
12598 if (striptype != LEFTSTRIP) {
12599 j--;
12600 while (j >= i) {
12601 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12602 if (!Py_UNICODE_ISSPACE(ch))
12603 break;
12604 j--;
12605 }
12606 j++;
12607 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012608 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012609
Victor Stinner7931d9a2011-11-04 00:22:48 +010012610 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611}
12612
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012613
12614static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012615do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012616{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012617 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012618 if (PyUnicode_Check(sep))
12619 return _PyUnicode_XStrip(self, striptype, sep);
12620 else {
12621 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012622 "%s arg must be None or str",
12623 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012624 return NULL;
12625 }
12626 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012627
Benjamin Peterson14339b62009-01-31 16:36:08 +000012628 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012629}
12630
12631
INADA Naoki3ae20562017-01-16 20:41:20 +090012632/*[clinic input]
12633str.strip as unicode_strip
12634
12635 chars: object = None
12636 /
12637
Zachary Ware09895c22019-10-09 16:09:00 -050012638Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012639
12640If chars is given and not None, remove characters in chars instead.
12641[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012642
12643static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012644unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012645/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012646{
INADA Naoki3ae20562017-01-16 20:41:20 +090012647 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012648}
12649
12650
INADA Naoki3ae20562017-01-16 20:41:20 +090012651/*[clinic input]
12652str.lstrip as unicode_lstrip
12653
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012654 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012655 /
12656
12657Return a copy of the string with leading whitespace removed.
12658
12659If chars is given and not None, remove characters in chars instead.
12660[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012661
12662static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012663unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012664/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012665{
INADA Naoki3ae20562017-01-16 20:41:20 +090012666 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012667}
12668
12669
INADA Naoki3ae20562017-01-16 20:41:20 +090012670/*[clinic input]
12671str.rstrip as unicode_rstrip
12672
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012673 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012674 /
12675
12676Return a copy of the string with trailing whitespace removed.
12677
12678If chars is given and not None, remove characters in chars instead.
12679[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012680
12681static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012682unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012683/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012684{
INADA Naoki3ae20562017-01-16 20:41:20 +090012685 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012686}
12687
12688
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012690unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012692 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694
Serhiy Storchaka05997252013-01-26 12:14:02 +020012695 if (len < 1)
12696 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697
Victor Stinnerc4b49542011-12-11 22:44:26 +010012698 /* no repeat, return original string */
12699 if (len == 1)
12700 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012701
Benjamin Petersonbac79492012-01-14 13:34:47 -050012702 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 return NULL;
12704
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012705 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012706 PyErr_SetString(PyExc_OverflowError,
12707 "repeated string is too long");
12708 return NULL;
12709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012711
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012712 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713 if (!u)
12714 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012715 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012718 int kind = PyUnicode_KIND(str);
12719 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012720 if (kind == PyUnicode_1BYTE_KIND) {
12721 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012722 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012723 }
12724 else if (kind == PyUnicode_2BYTE_KIND) {
12725 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012726 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012727 ucs2[n] = fill_char;
12728 } else {
12729 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12730 assert(kind == PyUnicode_4BYTE_KIND);
12731 for (n = 0; n < len; ++n)
12732 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 }
12735 else {
12736 /* number of characters copied this far */
12737 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012738 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012740 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012744 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012745 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747 }
12748
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012749 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012750 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012751}
12752
Alexander Belopolsky40018472011-02-26 01:02:56 +000012753PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012754PyUnicode_Replace(PyObject *str,
12755 PyObject *substr,
12756 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012757 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012758{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012759 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12760 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012761 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012762 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012763}
12764
INADA Naoki3ae20562017-01-16 20:41:20 +090012765/*[clinic input]
12766str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012767
INADA Naoki3ae20562017-01-16 20:41:20 +090012768 old: unicode
12769 new: unicode
12770 count: Py_ssize_t = -1
12771 Maximum number of occurrences to replace.
12772 -1 (the default value) means replace all occurrences.
12773 /
12774
12775Return a copy with all occurrences of substring old replaced by new.
12776
12777If the optional argument count is given, only the first count occurrences are
12778replaced.
12779[clinic start generated code]*/
12780
12781static PyObject *
12782unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12783 Py_ssize_t count)
12784/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012785{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012786 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012787 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012788 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789}
12790
Alexander Belopolsky40018472011-02-26 01:02:56 +000012791static PyObject *
12792unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012794 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 Py_ssize_t isize;
12796 Py_ssize_t osize, squote, dquote, i, o;
12797 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012798 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012799 const void *idata;
12800 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012803 return NULL;
12804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012805 isize = PyUnicode_GET_LENGTH(unicode);
12806 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012808 /* Compute length of output, quote characters, and
12809 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012810 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 max = 127;
12812 squote = dquote = 0;
12813 ikind = PyUnicode_KIND(unicode);
12814 for (i = 0; i < isize; i++) {
12815 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012816 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012817 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012818 case '\'': squote++; break;
12819 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012820 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012821 incr = 2;
12822 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823 default:
12824 /* Fast-path ASCII */
12825 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012826 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012828 ;
12829 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012832 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012833 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012834 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012836 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012837 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012838 if (osize > PY_SSIZE_T_MAX - incr) {
12839 PyErr_SetString(PyExc_OverflowError,
12840 "string is too long to generate repr");
12841 return NULL;
12842 }
12843 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 }
12845
12846 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012847 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012848 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012849 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012850 if (dquote)
12851 /* Both squote and dquote present. Use squote,
12852 and escape them */
12853 osize += squote;
12854 else
12855 quote = '"';
12856 }
Victor Stinner55c08782013-04-14 18:45:39 +020012857 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012858
12859 repr = PyUnicode_New(osize, max);
12860 if (repr == NULL)
12861 return NULL;
12862 okind = PyUnicode_KIND(repr);
12863 odata = PyUnicode_DATA(repr);
12864
12865 PyUnicode_WRITE(okind, odata, 0, quote);
12866 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012867 if (unchanged) {
12868 _PyUnicode_FastCopyCharacters(repr, 1,
12869 unicode, 0,
12870 isize);
12871 }
12872 else {
12873 for (i = 0, o = 1; i < isize; i++) {
12874 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875
Victor Stinner55c08782013-04-14 18:45:39 +020012876 /* Escape quotes and backslashes */
12877 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012878 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012879 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012880 continue;
12881 }
12882
12883 /* Map special whitespace to '\t', \n', '\r' */
12884 if (ch == '\t') {
12885 PyUnicode_WRITE(okind, odata, o++, '\\');
12886 PyUnicode_WRITE(okind, odata, o++, 't');
12887 }
12888 else if (ch == '\n') {
12889 PyUnicode_WRITE(okind, odata, o++, '\\');
12890 PyUnicode_WRITE(okind, odata, o++, 'n');
12891 }
12892 else if (ch == '\r') {
12893 PyUnicode_WRITE(okind, odata, o++, '\\');
12894 PyUnicode_WRITE(okind, odata, o++, 'r');
12895 }
12896
12897 /* Map non-printable US ASCII to '\xhh' */
12898 else if (ch < ' ' || ch == 0x7F) {
12899 PyUnicode_WRITE(okind, odata, o++, '\\');
12900 PyUnicode_WRITE(okind, odata, o++, 'x');
12901 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12902 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12903 }
12904
12905 /* Copy ASCII characters as-is */
12906 else if (ch < 0x7F) {
12907 PyUnicode_WRITE(okind, odata, o++, ch);
12908 }
12909
12910 /* Non-ASCII characters */
12911 else {
12912 /* Map Unicode whitespace and control characters
12913 (categories Z* and C* except ASCII space)
12914 */
12915 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12916 PyUnicode_WRITE(okind, odata, o++, '\\');
12917 /* Map 8-bit characters to '\xhh' */
12918 if (ch <= 0xff) {
12919 PyUnicode_WRITE(okind, odata, o++, 'x');
12920 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12921 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12922 }
12923 /* Map 16-bit characters to '\uxxxx' */
12924 else if (ch <= 0xffff) {
12925 PyUnicode_WRITE(okind, odata, o++, 'u');
12926 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12927 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12928 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12929 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12930 }
12931 /* Map 21-bit characters to '\U00xxxxxx' */
12932 else {
12933 PyUnicode_WRITE(okind, odata, o++, 'U');
12934 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12935 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12936 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12937 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12938 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12939 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12940 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12941 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12942 }
12943 }
12944 /* Copy characters as-is */
12945 else {
12946 PyUnicode_WRITE(okind, odata, o++, ch);
12947 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012948 }
12949 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012952 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012953 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954}
12955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012956PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012957 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958\n\
12959Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012960such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961arguments start and end are interpreted as in slice notation.\n\
12962\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012963Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012964
12965static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012968 /* initialize variables to prevent gcc warning */
12969 PyObject *substring = NULL;
12970 Py_ssize_t start = 0;
12971 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012972 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012973
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012974 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012975 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012977 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012978 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012980 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 if (result == -2)
12983 return NULL;
12984
Christian Heimes217cfd12007-12-02 14:31:20 +000012985 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986}
12987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012988PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012989 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012990\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012991Return the highest index in S where substring sub is found,\n\
12992such that sub is contained within S[start:end]. Optional\n\
12993arguments start and end are interpreted as in slice notation.\n\
12994\n\
12995Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012996
12997static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012999{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013000 /* initialize variables to prevent gcc warning */
13001 PyObject *substring = NULL;
13002 Py_ssize_t start = 0;
13003 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013004 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013005
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013006 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013007 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013008
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013009 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013010 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013012 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 if (result == -2)
13015 return NULL;
13016
Guido van Rossumd57fd912000-03-10 22:53:23 +000013017 if (result < 0) {
13018 PyErr_SetString(PyExc_ValueError, "substring not found");
13019 return NULL;
13020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021
Christian Heimes217cfd12007-12-02 14:31:20 +000013022 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013023}
13024
INADA Naoki3ae20562017-01-16 20:41:20 +090013025/*[clinic input]
13026str.rjust as unicode_rjust
13027
13028 width: Py_ssize_t
13029 fillchar: Py_UCS4 = ' '
13030 /
13031
13032Return a right-justified string of length width.
13033
13034Padding is done using the specified fill character (default is a space).
13035[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013036
13037static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013038unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13039/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013040{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013041 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013042 return NULL;
13043
Victor Stinnerc4b49542011-12-11 22:44:26 +010013044 if (PyUnicode_GET_LENGTH(self) >= width)
13045 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046
Victor Stinnerc4b49542011-12-11 22:44:26 +010013047 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013048}
13049
Alexander Belopolsky40018472011-02-26 01:02:56 +000013050PyObject *
13051PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013053 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013055
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013056 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057}
13058
INADA Naoki3ae20562017-01-16 20:41:20 +090013059/*[clinic input]
13060str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013061
INADA Naoki3ae20562017-01-16 20:41:20 +090013062 sep: object = None
13063 The delimiter according which to split the string.
13064 None (the default value) means split according to any whitespace,
13065 and discard empty strings from the result.
13066 maxsplit: Py_ssize_t = -1
13067 Maximum number of splits to do.
13068 -1 (the default value) means no limit.
13069
13070Return a list of the words in the string, using sep as the delimiter string.
13071[clinic start generated code]*/
13072
13073static PyObject *
13074unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13075/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076{
INADA Naoki3ae20562017-01-16 20:41:20 +090013077 if (sep == Py_None)
13078 return split(self, NULL, maxsplit);
13079 if (PyUnicode_Check(sep))
13080 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013081
Victor Stinner998b8062018-09-12 00:23:25 +020013082 PyErr_Format(PyExc_TypeError,
13083 "must be str or None, not %.100s",
13084 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086}
13087
Thomas Wouters477c8d52006-05-27 19:21:47 +000013088PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013089PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013090{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013091 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013092 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013093 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013094 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013095
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013096 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013097 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013098
Victor Stinner14f8f022011-10-05 20:58:25 +020013099 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013100 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013101 len1 = PyUnicode_GET_LENGTH(str_obj);
13102 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013103 if (kind1 < kind2 || len1 < len2) {
13104 _Py_INCREF_UNICODE_EMPTY();
13105 if (!unicode_empty)
13106 out = NULL;
13107 else {
13108 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13109 Py_DECREF(unicode_empty);
13110 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013111 return out;
13112 }
13113 buf1 = PyUnicode_DATA(str_obj);
13114 buf2 = PyUnicode_DATA(sep_obj);
13115 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013116 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013117 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013118 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013120
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013121 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013123 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13124 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13125 else
13126 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127 break;
13128 case PyUnicode_2BYTE_KIND:
13129 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13130 break;
13131 case PyUnicode_4BYTE_KIND:
13132 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13133 break;
13134 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013135 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013137
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013138 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013139 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013140 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013141
13142 return out;
13143}
13144
13145
13146PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013147PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013148{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013149 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013150 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013151 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013153
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013154 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013155 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013156
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013157 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013158 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013159 len1 = PyUnicode_GET_LENGTH(str_obj);
13160 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013161 if (kind1 < kind2 || len1 < len2) {
13162 _Py_INCREF_UNICODE_EMPTY();
13163 if (!unicode_empty)
13164 out = NULL;
13165 else {
13166 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13167 Py_DECREF(unicode_empty);
13168 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013169 return out;
13170 }
13171 buf1 = PyUnicode_DATA(str_obj);
13172 buf2 = PyUnicode_DATA(sep_obj);
13173 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013174 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013175 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013176 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013178
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013179 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013180 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013181 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13182 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13183 else
13184 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013185 break;
13186 case PyUnicode_2BYTE_KIND:
13187 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13188 break;
13189 case PyUnicode_4BYTE_KIND:
13190 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13191 break;
13192 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013193 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013194 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013195
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013196 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013197 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013198 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013199
13200 return out;
13201}
13202
INADA Naoki3ae20562017-01-16 20:41:20 +090013203/*[clinic input]
13204str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013205
INADA Naoki3ae20562017-01-16 20:41:20 +090013206 sep: object
13207 /
13208
13209Partition the string into three parts using the given separator.
13210
13211This will search for the separator in the string. If the separator is found,
13212returns a 3-tuple containing the part before the separator, the separator
13213itself, and the part after it.
13214
13215If the separator is not found, returns a 3-tuple containing the original string
13216and two empty strings.
13217[clinic start generated code]*/
13218
13219static PyObject *
13220unicode_partition(PyObject *self, PyObject *sep)
13221/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013222{
INADA Naoki3ae20562017-01-16 20:41:20 +090013223 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013224}
13225
INADA Naoki3ae20562017-01-16 20:41:20 +090013226/*[clinic input]
13227str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013228
INADA Naoki3ae20562017-01-16 20:41:20 +090013229Partition the string into three parts using the given separator.
13230
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013231This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013232the separator is found, returns a 3-tuple containing the part before the
13233separator, the separator itself, and the part after it.
13234
13235If the separator is not found, returns a 3-tuple containing two empty strings
13236and the original string.
13237[clinic start generated code]*/
13238
13239static PyObject *
13240unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013241/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013242{
INADA Naoki3ae20562017-01-16 20:41:20 +090013243 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013244}
13245
Alexander Belopolsky40018472011-02-26 01:02:56 +000013246PyObject *
13247PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013248{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013249 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013250 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013251
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013252 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013253}
13254
INADA Naoki3ae20562017-01-16 20:41:20 +090013255/*[clinic input]
13256str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013257
INADA Naoki3ae20562017-01-16 20:41:20 +090013258Return a list of the words in the string, using sep as the delimiter string.
13259
13260Splits are done starting at the end of the string and working to the front.
13261[clinic start generated code]*/
13262
13263static PyObject *
13264unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13265/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013266{
INADA Naoki3ae20562017-01-16 20:41:20 +090013267 if (sep == Py_None)
13268 return rsplit(self, NULL, maxsplit);
13269 if (PyUnicode_Check(sep))
13270 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013271
Victor Stinner998b8062018-09-12 00:23:25 +020013272 PyErr_Format(PyExc_TypeError,
13273 "must be str or None, not %.100s",
13274 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013275 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013276}
13277
INADA Naoki3ae20562017-01-16 20:41:20 +090013278/*[clinic input]
13279str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013281 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013282
13283Return a list of the lines in the string, breaking at line boundaries.
13284
13285Line breaks are not included in the resulting list unless keepends is given and
13286true.
13287[clinic start generated code]*/
13288
13289static PyObject *
13290unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013291/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013293 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013294}
13295
13296static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013297PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013299 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300}
13301
INADA Naoki3ae20562017-01-16 20:41:20 +090013302/*[clinic input]
13303str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304
INADA Naoki3ae20562017-01-16 20:41:20 +090013305Convert uppercase characters to lowercase and lowercase characters to uppercase.
13306[clinic start generated code]*/
13307
13308static PyObject *
13309unicode_swapcase_impl(PyObject *self)
13310/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013312 if (PyUnicode_READY(self) == -1)
13313 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013314 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013315}
13316
Larry Hastings61272b72014-01-07 12:41:53 -080013317/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013318
Larry Hastings31826802013-10-19 00:09:25 -070013319@staticmethod
13320str.maketrans as unicode_maketrans
13321
13322 x: object
13323
13324 y: unicode=NULL
13325
13326 z: unicode=NULL
13327
13328 /
13329
13330Return a translation table usable for str.translate().
13331
13332If there is only one argument, it must be a dictionary mapping Unicode
13333ordinals (integers) or characters to Unicode ordinals, strings or None.
13334Character keys will be then converted to ordinals.
13335If there are two arguments, they must be strings of equal length, and
13336in the resulting dictionary, each character in x will be mapped to the
13337character at the same position in y. If there is a third argument, it
13338must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013339[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013340
Larry Hastings31826802013-10-19 00:09:25 -070013341static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013342unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013343/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013344{
Georg Brandlceee0772007-11-27 23:48:05 +000013345 PyObject *new = NULL, *key, *value;
13346 Py_ssize_t i = 0;
13347 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013348
Georg Brandlceee0772007-11-27 23:48:05 +000013349 new = PyDict_New();
13350 if (!new)
13351 return NULL;
13352 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013353 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013354 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355
Georg Brandlceee0772007-11-27 23:48:05 +000013356 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013357 if (!PyUnicode_Check(x)) {
13358 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13359 "be a string if there is a second argument");
13360 goto err;
13361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013362 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013363 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13364 "arguments must have equal length");
13365 goto err;
13366 }
13367 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013368 x_kind = PyUnicode_KIND(x);
13369 y_kind = PyUnicode_KIND(y);
13370 x_data = PyUnicode_DATA(x);
13371 y_data = PyUnicode_DATA(y);
13372 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13373 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013374 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013375 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013376 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013377 if (!value) {
13378 Py_DECREF(key);
13379 goto err;
13380 }
Georg Brandlceee0772007-11-27 23:48:05 +000013381 res = PyDict_SetItem(new, key, value);
13382 Py_DECREF(key);
13383 Py_DECREF(value);
13384 if (res < 0)
13385 goto err;
13386 }
13387 /* create entries for deleting chars in z */
13388 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013389 z_kind = PyUnicode_KIND(z);
13390 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013391 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013392 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013393 if (!key)
13394 goto err;
13395 res = PyDict_SetItem(new, key, Py_None);
13396 Py_DECREF(key);
13397 if (res < 0)
13398 goto err;
13399 }
13400 }
13401 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013402 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013403 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013404
Georg Brandlceee0772007-11-27 23:48:05 +000013405 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013406 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013407 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13408 "to maketrans it must be a dict");
13409 goto err;
13410 }
13411 /* copy entries into the new dict, converting string keys to int keys */
13412 while (PyDict_Next(x, &i, &key, &value)) {
13413 if (PyUnicode_Check(key)) {
13414 /* convert string keys to integer keys */
13415 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013416 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013417 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13418 "table must be of length 1");
13419 goto err;
13420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013421 kind = PyUnicode_KIND(key);
13422 data = PyUnicode_DATA(key);
13423 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013424 if (!newkey)
13425 goto err;
13426 res = PyDict_SetItem(new, newkey, value);
13427 Py_DECREF(newkey);
13428 if (res < 0)
13429 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013430 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013431 /* just keep integer keys */
13432 if (PyDict_SetItem(new, key, value) < 0)
13433 goto err;
13434 } else {
13435 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13436 "be strings or integers");
13437 goto err;
13438 }
13439 }
13440 }
13441 return new;
13442 err:
13443 Py_DECREF(new);
13444 return NULL;
13445}
13446
INADA Naoki3ae20562017-01-16 20:41:20 +090013447/*[clinic input]
13448str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449
INADA Naoki3ae20562017-01-16 20:41:20 +090013450 table: object
13451 Translation table, which must be a mapping of Unicode ordinals to
13452 Unicode ordinals, strings, or None.
13453 /
13454
13455Replace each character in the string using the given translation table.
13456
13457The table must implement lookup/indexing via __getitem__, for instance a
13458dictionary or list. If this operation raises LookupError, the character is
13459left untouched. Characters mapped to None are deleted.
13460[clinic start generated code]*/
13461
13462static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013463unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013464/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013466 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013467}
13468
INADA Naoki3ae20562017-01-16 20:41:20 +090013469/*[clinic input]
13470str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013471
INADA Naoki3ae20562017-01-16 20:41:20 +090013472Return a copy of the string converted to uppercase.
13473[clinic start generated code]*/
13474
13475static PyObject *
13476unicode_upper_impl(PyObject *self)
13477/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013478{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013479 if (PyUnicode_READY(self) == -1)
13480 return NULL;
13481 if (PyUnicode_IS_ASCII(self))
13482 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013483 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013484}
13485
INADA Naoki3ae20562017-01-16 20:41:20 +090013486/*[clinic input]
13487str.zfill as unicode_zfill
13488
13489 width: Py_ssize_t
13490 /
13491
13492Pad a numeric string with zeros on the left, to fill a field of the given width.
13493
13494The string is never truncated.
13495[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013496
13497static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013498unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013499/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013500{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013501 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013502 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013503 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013504 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013505 Py_UCS4 chr;
13506
Benjamin Petersonbac79492012-01-14 13:34:47 -050013507 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013508 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013509
Victor Stinnerc4b49542011-12-11 22:44:26 +010013510 if (PyUnicode_GET_LENGTH(self) >= width)
13511 return unicode_result_unchanged(self);
13512
13513 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013514
13515 u = pad(self, fill, 0, '0');
13516
Walter Dörwald068325e2002-04-15 13:36:47 +000013517 if (u == NULL)
13518 return NULL;
13519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013520 kind = PyUnicode_KIND(u);
13521 data = PyUnicode_DATA(u);
13522 chr = PyUnicode_READ(kind, data, fill);
13523
13524 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013525 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013526 PyUnicode_WRITE(kind, data, 0, chr);
13527 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013528 }
13529
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013530 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013531 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013532}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013533
#if 0
/* Debugging aid only; compiled out, like its entry in the method table. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013542PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013543 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013544\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013545Return True if S starts with the specified prefix, False otherwise.\n\
13546With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013547With optional end, stop comparing S at that position.\n\
13548prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013549
13550static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013551unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013552 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013553{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013554 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013555 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013556 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013557 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013558 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013559
Jesus Ceaac451502011-04-20 17:09:23 +020013560 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013561 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013562 if (PyTuple_Check(subobj)) {
13563 Py_ssize_t i;
13564 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013565 substring = PyTuple_GET_ITEM(subobj, i);
13566 if (!PyUnicode_Check(substring)) {
13567 PyErr_Format(PyExc_TypeError,
13568 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013569 "not %.100s",
13570 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013571 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013572 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013573 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013574 if (result == -1)
13575 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013576 if (result) {
13577 Py_RETURN_TRUE;
13578 }
13579 }
13580 /* nothing matched */
13581 Py_RETURN_FALSE;
13582 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013583 if (!PyUnicode_Check(subobj)) {
13584 PyErr_Format(PyExc_TypeError,
13585 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013586 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013587 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013588 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013589 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013590 if (result == -1)
13591 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013592 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013593}
13594
13595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013596PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013597 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013598\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013599Return True if S ends with the specified suffix, False otherwise.\n\
13600With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013601With optional end, stop comparing S at that position.\n\
13602suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013603
13604static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013605unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013606 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013607{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013608 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013609 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013610 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013611 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013612 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013613
Jesus Ceaac451502011-04-20 17:09:23 +020013614 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013615 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013616 if (PyTuple_Check(subobj)) {
13617 Py_ssize_t i;
13618 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013619 substring = PyTuple_GET_ITEM(subobj, i);
13620 if (!PyUnicode_Check(substring)) {
13621 PyErr_Format(PyExc_TypeError,
13622 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013623 "not %.100s",
13624 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013625 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013626 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013627 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013628 if (result == -1)
13629 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013630 if (result) {
13631 Py_RETURN_TRUE;
13632 }
13633 }
13634 Py_RETURN_FALSE;
13635 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013636 if (!PyUnicode_Check(subobj)) {
13637 PyErr_Format(PyExc_TypeError,
13638 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013639 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013640 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013641 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013642 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013643 if (result == -1)
13644 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013645 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013646}
13647
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013648static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013649_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013650{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013651 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13652 writer->data = PyUnicode_DATA(writer->buffer);
13653
13654 if (!writer->readonly) {
13655 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013656 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013657 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013658 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013659 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13660 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13661 writer->kind = PyUnicode_WCHAR_KIND;
13662 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13663
Victor Stinner8f674cc2013-04-17 23:02:17 +020013664 /* Copy-on-write mode: set buffer size to 0 so
13665 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13666 * next write. */
13667 writer->size = 0;
13668 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013669}
13670
Victor Stinnerd3f08822012-05-29 12:57:52 +020013671void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013672_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013673{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013674 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013675
13676 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013677 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013678
13679 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13680 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13681 writer->kind = PyUnicode_WCHAR_KIND;
13682 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013683}
13684
Inada Naoki770847a2019-06-24 12:30:24 +090013685// Initialize _PyUnicodeWriter with initial buffer
13686static inline void
13687_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13688{
13689 memset(writer, 0, sizeof(*writer));
13690 writer->buffer = buffer;
13691 _PyUnicodeWriter_Update(writer);
13692 writer->min_length = writer->size;
13693}
13694
Victor Stinnerd3f08822012-05-29 12:57:52 +020013695int
13696_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13697 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013698{
13699 Py_ssize_t newlen;
13700 PyObject *newbuffer;
13701
Victor Stinner2740e462016-09-06 16:58:36 -070013702 assert(maxchar <= MAX_UNICODE);
13703
Victor Stinnerca9381e2015-09-22 00:58:32 +020013704 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013705 assert((maxchar > writer->maxchar && length >= 0)
13706 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013707
Victor Stinner202fdca2012-05-07 12:47:02 +020013708 if (length > PY_SSIZE_T_MAX - writer->pos) {
13709 PyErr_NoMemory();
13710 return -1;
13711 }
13712 newlen = writer->pos + length;
13713
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013714 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013715
Victor Stinnerd3f08822012-05-29 12:57:52 +020013716 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013717 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013718 if (writer->overallocate
13719 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13720 /* overallocate to limit the number of realloc() */
13721 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013722 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013723 if (newlen < writer->min_length)
13724 newlen = writer->min_length;
13725
Victor Stinnerd3f08822012-05-29 12:57:52 +020013726 writer->buffer = PyUnicode_New(newlen, maxchar);
13727 if (writer->buffer == NULL)
13728 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013729 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013730 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013731 if (writer->overallocate
13732 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13733 /* overallocate to limit the number of realloc() */
13734 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013735 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013736 if (newlen < writer->min_length)
13737 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013738
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013739 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013740 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013741 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013742 newbuffer = PyUnicode_New(newlen, maxchar);
13743 if (newbuffer == NULL)
13744 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013745 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13746 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013747 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013748 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013749 }
13750 else {
13751 newbuffer = resize_compact(writer->buffer, newlen);
13752 if (newbuffer == NULL)
13753 return -1;
13754 }
13755 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013756 }
13757 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013758 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013759 newbuffer = PyUnicode_New(writer->size, maxchar);
13760 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013761 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013762 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13763 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013764 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013765 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013766 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013767 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013768
13769#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013770}
13771
Victor Stinnerca9381e2015-09-22 00:58:32 +020013772int
13773_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13774 enum PyUnicode_Kind kind)
13775{
13776 Py_UCS4 maxchar;
13777
13778 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13779 assert(writer->kind < kind);
13780
13781 switch (kind)
13782 {
13783 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13784 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13785 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13786 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013787 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013788 }
13789
13790 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13791}
13792
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013793static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013794_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013795{
Victor Stinner2740e462016-09-06 16:58:36 -070013796 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013797 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13798 return -1;
13799 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13800 writer->pos++;
13801 return 0;
13802}
13803
13804int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013805_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13806{
13807 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13808}
13809
13810int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013811_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13812{
13813 Py_UCS4 maxchar;
13814 Py_ssize_t len;
13815
13816 if (PyUnicode_READY(str) == -1)
13817 return -1;
13818 len = PyUnicode_GET_LENGTH(str);
13819 if (len == 0)
13820 return 0;
13821 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13822 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013823 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013824 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013825 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013826 Py_INCREF(str);
13827 writer->buffer = str;
13828 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013829 writer->pos += len;
13830 return 0;
13831 }
13832 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13833 return -1;
13834 }
13835 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13836 str, 0, len);
13837 writer->pos += len;
13838 return 0;
13839}
13840
Victor Stinnere215d962012-10-06 23:03:36 +020013841int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013842_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13843 Py_ssize_t start, Py_ssize_t end)
13844{
13845 Py_UCS4 maxchar;
13846 Py_ssize_t len;
13847
13848 if (PyUnicode_READY(str) == -1)
13849 return -1;
13850
13851 assert(0 <= start);
13852 assert(end <= PyUnicode_GET_LENGTH(str));
13853 assert(start <= end);
13854
13855 if (end == 0)
13856 return 0;
13857
13858 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13859 return _PyUnicodeWriter_WriteStr(writer, str);
13860
13861 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13862 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13863 else
13864 maxchar = writer->maxchar;
13865 len = end - start;
13866
13867 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13868 return -1;
13869
13870 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13871 str, start, len);
13872 writer->pos += len;
13873 return 0;
13874}
13875
13876int
Victor Stinner4a587072013-11-19 12:54:53 +010013877_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13878 const char *ascii, Py_ssize_t len)
13879{
13880 if (len == -1)
13881 len = strlen(ascii);
13882
Andy Lestere6be9b52020-02-11 20:28:35 -060013883 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010013884
13885 if (writer->buffer == NULL && !writer->overallocate) {
13886 PyObject *str;
13887
13888 str = _PyUnicode_FromASCII(ascii, len);
13889 if (str == NULL)
13890 return -1;
13891
13892 writer->readonly = 1;
13893 writer->buffer = str;
13894 _PyUnicodeWriter_Update(writer);
13895 writer->pos += len;
13896 return 0;
13897 }
13898
13899 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13900 return -1;
13901
13902 switch (writer->kind)
13903 {
13904 case PyUnicode_1BYTE_KIND:
13905 {
13906 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13907 Py_UCS1 *data = writer->data;
13908
Christian Heimesf051e432016-09-13 20:22:02 +020013909 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013910 break;
13911 }
13912 case PyUnicode_2BYTE_KIND:
13913 {
13914 _PyUnicode_CONVERT_BYTES(
13915 Py_UCS1, Py_UCS2,
13916 ascii, ascii + len,
13917 (Py_UCS2 *)writer->data + writer->pos);
13918 break;
13919 }
13920 case PyUnicode_4BYTE_KIND:
13921 {
13922 _PyUnicode_CONVERT_BYTES(
13923 Py_UCS1, Py_UCS4,
13924 ascii, ascii + len,
13925 (Py_UCS4 *)writer->data + writer->pos);
13926 break;
13927 }
13928 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013929 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013930 }
13931
13932 writer->pos += len;
13933 return 0;
13934}
13935
13936int
13937_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13938 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013939{
13940 Py_UCS4 maxchar;
13941
Andy Lestere6be9b52020-02-11 20:28:35 -060013942 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020013943 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13944 return -1;
13945 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13946 writer->pos += len;
13947 return 0;
13948}
13949
Victor Stinnerd3f08822012-05-29 12:57:52 +020013950PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013951_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013952{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013953 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013954
Victor Stinnerd3f08822012-05-29 12:57:52 +020013955 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013956 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013957 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013958 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013959
13960 str = writer->buffer;
13961 writer->buffer = NULL;
13962
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013963 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013964 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13965 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013966 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013967
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013968 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13969 PyObject *str2;
13970 str2 = resize_compact(str, writer->pos);
13971 if (str2 == NULL) {
13972 Py_DECREF(str);
13973 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013974 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013975 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013976 }
13977
Victor Stinner15a0bd32013-07-08 22:29:55 +020013978 assert(_PyUnicode_CheckConsistency(str, 1));
13979 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013980}
13981
Victor Stinnerd3f08822012-05-29 12:57:52 +020013982void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013983_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013984{
13985 Py_CLEAR(writer->buffer);
13986}
13987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013988#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013989
13990PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013991 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013992\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013993Return a formatted version of S, using substitutions from args and kwargs.\n\
13994The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013995
Eric Smith27bbca62010-11-04 17:06:58 +000013996PyDoc_STRVAR(format_map__doc__,
13997 "S.format_map(mapping) -> str\n\
13998\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013999Return a formatted version of S, using substitutions from mapping.\n\
14000The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014001
INADA Naoki3ae20562017-01-16 20:41:20 +090014002/*[clinic input]
14003str.__format__ as unicode___format__
14004
14005 format_spec: unicode
14006 /
14007
14008Return a formatted version of the string as described by format_spec.
14009[clinic start generated code]*/
14010
Eric Smith4a7d76d2008-05-30 18:10:19 +000014011static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014012unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014013/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014014{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014015 _PyUnicodeWriter writer;
14016 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014017
Victor Stinnerd3f08822012-05-29 12:57:52 +020014018 if (PyUnicode_READY(self) == -1)
14019 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014020 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014021 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14022 self, format_spec, 0,
14023 PyUnicode_GET_LENGTH(format_spec));
14024 if (ret == -1) {
14025 _PyUnicodeWriter_Dealloc(&writer);
14026 return NULL;
14027 }
14028 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014029}
14030
INADA Naoki3ae20562017-01-16 20:41:20 +090014031/*[clinic input]
14032str.__sizeof__ as unicode_sizeof
14033
14034Return the size of the string in memory, in bytes.
14035[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014036
14037static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014038unicode_sizeof_impl(PyObject *self)
14039/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014041 Py_ssize_t size;
14042
14043 /* If it's a compact object, account for base structure +
14044 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014045 if (PyUnicode_IS_COMPACT_ASCII(self))
14046 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14047 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014048 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014049 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014050 else {
14051 /* If it is a two-block object, account for base object, and
14052 for character block if present. */
14053 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014054 if (_PyUnicode_DATA_ANY(self))
14055 size += (PyUnicode_GET_LENGTH(self) + 1) *
14056 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014057 }
14058 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014059 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014060 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14061 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14062 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14063 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014064
14065 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014066}
14067
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014068static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014069unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014070{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014071 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014072 if (!copy)
14073 return NULL;
14074 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014075}
14076
Guido van Rossumd57fd912000-03-10 22:53:23 +000014077static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014078 UNICODE_ENCODE_METHODDEF
14079 UNICODE_REPLACE_METHODDEF
14080 UNICODE_SPLIT_METHODDEF
14081 UNICODE_RSPLIT_METHODDEF
14082 UNICODE_JOIN_METHODDEF
14083 UNICODE_CAPITALIZE_METHODDEF
14084 UNICODE_CASEFOLD_METHODDEF
14085 UNICODE_TITLE_METHODDEF
14086 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014087 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014088 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014089 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014090 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014091 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014092 UNICODE_LJUST_METHODDEF
14093 UNICODE_LOWER_METHODDEF
14094 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014095 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14096 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014097 UNICODE_RJUST_METHODDEF
14098 UNICODE_RSTRIP_METHODDEF
14099 UNICODE_RPARTITION_METHODDEF
14100 UNICODE_SPLITLINES_METHODDEF
14101 UNICODE_STRIP_METHODDEF
14102 UNICODE_SWAPCASE_METHODDEF
14103 UNICODE_TRANSLATE_METHODDEF
14104 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014105 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14106 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090014107 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014108 UNICODE_ISLOWER_METHODDEF
14109 UNICODE_ISUPPER_METHODDEF
14110 UNICODE_ISTITLE_METHODDEF
14111 UNICODE_ISSPACE_METHODDEF
14112 UNICODE_ISDECIMAL_METHODDEF
14113 UNICODE_ISDIGIT_METHODDEF
14114 UNICODE_ISNUMERIC_METHODDEF
14115 UNICODE_ISALPHA_METHODDEF
14116 UNICODE_ISALNUM_METHODDEF
14117 UNICODE_ISIDENTIFIER_METHODDEF
14118 UNICODE_ISPRINTABLE_METHODDEF
14119 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014120 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014121 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014122 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014123 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014124 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014125#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014126 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014127 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014128#endif
14129
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014130 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014131 {NULL, NULL}
14132};
14133
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014134static PyObject *
14135unicode_mod(PyObject *v, PyObject *w)
14136{
Brian Curtindfc80e32011-08-10 20:28:54 -050014137 if (!PyUnicode_Check(v))
14138 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014139 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014140}
14141
14142static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014143 0, /*nb_add*/
14144 0, /*nb_subtract*/
14145 0, /*nb_multiply*/
14146 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014147};
14148
Guido van Rossumd57fd912000-03-10 22:53:23 +000014149static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014150 (lenfunc) unicode_length, /* sq_length */
14151 PyUnicode_Concat, /* sq_concat */
14152 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14153 (ssizeargfunc) unicode_getitem, /* sq_item */
14154 0, /* sq_slice */
14155 0, /* sq_ass_item */
14156 0, /* sq_ass_slice */
14157 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014158};
14159
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014160static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014161unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014162{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014163 if (PyUnicode_READY(self) == -1)
14164 return NULL;
14165
Victor Stinnera15e2602020-04-08 02:01:56 +020014166 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014167 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014168 if (i == -1 && PyErr_Occurred())
14169 return NULL;
14170 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014171 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014172 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014173 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014174 Py_ssize_t start, stop, step, slicelength, i;
14175 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014176 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014177 const void *src_data;
14178 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014179 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014180 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014181
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014182 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014183 return NULL;
14184 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014185 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14186 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014187
14188 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014189 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014190 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014191 slicelength == PyUnicode_GET_LENGTH(self)) {
14192 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014193 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014194 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014195 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014196 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014197 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014198 src_kind = PyUnicode_KIND(self);
14199 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014200 if (!PyUnicode_IS_ASCII(self)) {
14201 kind_limit = kind_maxchar_limit(src_kind);
14202 max_char = 0;
14203 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14204 ch = PyUnicode_READ(src_kind, src_data, cur);
14205 if (ch > max_char) {
14206 max_char = ch;
14207 if (max_char >= kind_limit)
14208 break;
14209 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014210 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014211 }
Victor Stinner55c99112011-10-13 01:17:06 +020014212 else
14213 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014214 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014215 if (result == NULL)
14216 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014217 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014218 dest_data = PyUnicode_DATA(result);
14219
14220 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014221 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14222 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014223 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014224 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014225 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014226 } else {
14227 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14228 return NULL;
14229 }
14230}
14231
14232static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014233 (lenfunc)unicode_length, /* mp_length */
14234 (binaryfunc)unicode_subscript, /* mp_subscript */
14235 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014236};
14237
Guido van Rossumd57fd912000-03-10 22:53:23 +000014238
Guido van Rossumd57fd912000-03-10 22:53:23 +000014239/* Helpers for PyUnicode_Format() */
14240
Victor Stinnera47082312012-10-04 02:19:54 +020014241struct unicode_formatter_t {
14242 PyObject *args;
14243 int args_owned;
14244 Py_ssize_t arglen, argidx;
14245 PyObject *dict;
14246
14247 enum PyUnicode_Kind fmtkind;
14248 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014249 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014250 PyObject *fmtstr;
14251
14252 _PyUnicodeWriter writer;
14253};
14254
14255struct unicode_format_arg_t {
14256 Py_UCS4 ch;
14257 int flags;
14258 Py_ssize_t width;
14259 int prec;
14260 int sign;
14261};
14262
Guido van Rossumd57fd912000-03-10 22:53:23 +000014263static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014264unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014265{
Victor Stinnera47082312012-10-04 02:19:54 +020014266 Py_ssize_t argidx = ctx->argidx;
14267
14268 if (argidx < ctx->arglen) {
14269 ctx->argidx++;
14270 if (ctx->arglen < 0)
14271 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014272 else
Victor Stinnera47082312012-10-04 02:19:54 +020014273 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014274 }
14275 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014276 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014277 return NULL;
14278}
14279
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014280/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014281
Victor Stinnera47082312012-10-04 02:19:54 +020014282/* Format a float into the writer if the writer is not NULL, or into *p_output
14283 otherwise.
14284
14285 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014286static int
Victor Stinnera47082312012-10-04 02:19:54 +020014287formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14288 PyObject **p_output,
14289 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014290{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014291 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014292 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014293 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014294 int prec;
14295 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014296
Guido van Rossumd57fd912000-03-10 22:53:23 +000014297 x = PyFloat_AsDouble(v);
14298 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014299 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014300
Victor Stinnera47082312012-10-04 02:19:54 +020014301 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014302 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014303 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014304
Victor Stinnera47082312012-10-04 02:19:54 +020014305 if (arg->flags & F_ALT)
14306 dtoa_flags = Py_DTSF_ALT;
14307 else
14308 dtoa_flags = 0;
14309 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014310 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014311 return -1;
14312 len = strlen(p);
14313 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014314 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014315 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014316 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014317 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014318 }
14319 else
14320 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014321 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014322 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014323}
14324
Victor Stinnerd0880d52012-04-27 23:40:13 +020014325/* formatlong() emulates the format codes d, u, o, x and X, and
14326 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14327 * Python's regular ints.
14328 * Return value: a new PyUnicodeObject*, or NULL if error.
14329 * The output string is of the form
14330 * "-"? ("0x" | "0X")? digit+
14331 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14332 * set in flags. The case of hex digits will be correct,
14333 * There will be at least prec digits, zero-filled on the left if
14334 * necessary to get that many.
14335 * val object to be converted
14336 * flags bitmask of format flags; only F_ALT is looked at
14337 * prec minimum number of digits; 0-fill on left if needed
14338 * type a character in [duoxX]; u acts the same as d
14339 *
14340 * CAUTION: o, x and X conversions on regular ints can never
14341 * produce a '-' sign, but can for Python's unbounded ints.
14342 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014343PyObject *
14344_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014345{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014346 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014347 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014348 Py_ssize_t i;
14349 int sign; /* 1 if '-', else 0 */
14350 int len; /* number of characters */
14351 Py_ssize_t llen;
14352 int numdigits; /* len == numnondigits + numdigits */
14353 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014354
Victor Stinnerd0880d52012-04-27 23:40:13 +020014355 /* Avoid exceeding SSIZE_T_MAX */
14356 if (prec > INT_MAX-3) {
14357 PyErr_SetString(PyExc_OverflowError,
14358 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014359 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014360 }
14361
14362 assert(PyLong_Check(val));
14363
14364 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014365 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014366 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014367 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014368 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014369 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014370 /* int and int subclasses should print numerically when a numeric */
14371 /* format code is used (see issue18780) */
14372 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014373 break;
14374 case 'o':
14375 numnondigits = 2;
14376 result = PyNumber_ToBase(val, 8);
14377 break;
14378 case 'x':
14379 case 'X':
14380 numnondigits = 2;
14381 result = PyNumber_ToBase(val, 16);
14382 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014383 }
14384 if (!result)
14385 return NULL;
14386
14387 assert(unicode_modifiable(result));
14388 assert(PyUnicode_IS_READY(result));
14389 assert(PyUnicode_IS_ASCII(result));
14390
14391 /* To modify the string in-place, there can only be one reference. */
14392 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014393 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014394 PyErr_BadInternalCall();
14395 return NULL;
14396 }
14397 buf = PyUnicode_DATA(result);
14398 llen = PyUnicode_GET_LENGTH(result);
14399 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014400 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014401 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014402 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014403 return NULL;
14404 }
14405 len = (int)llen;
14406 sign = buf[0] == '-';
14407 numnondigits += sign;
14408 numdigits = len - numnondigits;
14409 assert(numdigits > 0);
14410
14411 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014412 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014413 (type == 'o' || type == 'x' || type == 'X'))) {
14414 assert(buf[sign] == '0');
14415 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14416 buf[sign+1] == 'o');
14417 numnondigits -= 2;
14418 buf += 2;
14419 len -= 2;
14420 if (sign)
14421 buf[0] = '-';
14422 assert(len == numnondigits + numdigits);
14423 assert(numdigits > 0);
14424 }
14425
14426 /* Fill with leading zeroes to meet minimum width. */
14427 if (prec > numdigits) {
14428 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14429 numnondigits + prec);
14430 char *b1;
14431 if (!r1) {
14432 Py_DECREF(result);
14433 return NULL;
14434 }
14435 b1 = PyBytes_AS_STRING(r1);
14436 for (i = 0; i < numnondigits; ++i)
14437 *b1++ = *buf++;
14438 for (i = 0; i < prec - numdigits; i++)
14439 *b1++ = '0';
14440 for (i = 0; i < numdigits; i++)
14441 *b1++ = *buf++;
14442 *b1 = '\0';
14443 Py_DECREF(result);
14444 result = r1;
14445 buf = PyBytes_AS_STRING(result);
14446 len = numnondigits + prec;
14447 }
14448
14449 /* Fix up case for hex conversions. */
14450 if (type == 'X') {
14451 /* Need to convert all lower case letters to upper case.
14452 and need to convert 0x to 0X (and -0x to -0X). */
14453 for (i = 0; i < len; i++)
14454 if (buf[i] >= 'a' && buf[i] <= 'x')
14455 buf[i] -= 'a'-'A';
14456 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014457 if (!PyUnicode_Check(result)
14458 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014459 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014460 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014461 Py_DECREF(result);
14462 result = unicode;
14463 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014464 else if (len != PyUnicode_GET_LENGTH(result)) {
14465 if (PyUnicode_Resize(&result, len) < 0)
14466 Py_CLEAR(result);
14467 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014468 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014469}
14470
Ethan Furmandf3ed242014-01-05 06:50:30 -080014471/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014472 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014473 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014474 * -1 and raise an exception on error */
14475static int
Victor Stinnera47082312012-10-04 02:19:54 +020014476mainformatlong(PyObject *v,
14477 struct unicode_format_arg_t *arg,
14478 PyObject **p_output,
14479 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014480{
14481 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014482 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014483
14484 if (!PyNumber_Check(v))
14485 goto wrongtype;
14486
Ethan Furman9ab74802014-03-21 06:38:46 -070014487 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014488 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014489 if (type == 'o' || type == 'x' || type == 'X') {
14490 iobj = PyNumber_Index(v);
14491 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014492 if (PyErr_ExceptionMatches(PyExc_TypeError))
14493 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014494 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014495 }
14496 }
14497 else {
14498 iobj = PyNumber_Long(v);
14499 if (iobj == NULL ) {
14500 if (PyErr_ExceptionMatches(PyExc_TypeError))
14501 goto wrongtype;
14502 return -1;
14503 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014504 }
14505 assert(PyLong_Check(iobj));
14506 }
14507 else {
14508 iobj = v;
14509 Py_INCREF(iobj);
14510 }
14511
14512 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014513 && arg->width == -1 && arg->prec == -1
14514 && !(arg->flags & (F_SIGN | F_BLANK))
14515 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014516 {
14517 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014518 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014519 int base;
14520
Victor Stinnera47082312012-10-04 02:19:54 +020014521 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014522 {
14523 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014524 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014525 case 'd':
14526 case 'i':
14527 case 'u':
14528 base = 10;
14529 break;
14530 case 'o':
14531 base = 8;
14532 break;
14533 case 'x':
14534 case 'X':
14535 base = 16;
14536 break;
14537 }
14538
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014539 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14540 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014541 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014542 }
14543 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014544 return 1;
14545 }
14546
Ethan Furmanb95b5612015-01-23 20:05:18 -080014547 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014548 Py_DECREF(iobj);
14549 if (res == NULL)
14550 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014551 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014552 return 0;
14553
14554wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014555 switch(type)
14556 {
14557 case 'o':
14558 case 'x':
14559 case 'X':
14560 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014561 "%%%c format: an integer is required, "
14562 "not %.200s",
14563 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014564 break;
14565 default:
14566 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014567 "%%%c format: a number is required, "
14568 "not %.200s",
14569 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014570 break;
14571 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014572 return -1;
14573}
14574
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014575static Py_UCS4
14576formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014577{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014578 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014579 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014580 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014581 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014582 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014583 goto onError;
14584 }
14585 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014586 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014587 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014588 /* make sure number is a type of integer */
14589 if (!PyLong_Check(v)) {
14590 iobj = PyNumber_Index(v);
14591 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014592 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014593 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014594 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014595 Py_DECREF(iobj);
14596 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014597 else {
14598 x = PyLong_AsLong(v);
14599 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014600 if (x == -1 && PyErr_Occurred())
14601 goto onError;
14602
Victor Stinner8faf8212011-12-08 22:14:11 +010014603 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014604 PyErr_SetString(PyExc_OverflowError,
14605 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014606 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014607 }
14608
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014609 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014610 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014611
Benjamin Peterson29060642009-01-31 22:14:21 +000014612 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014613 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014614 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014615 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014616}
14617
Victor Stinnera47082312012-10-04 02:19:54 +020014618/* Parse options of an argument: flags, width, precision.
14619 Handle also "%(name)" syntax.
14620
14621 Return 0 if the argument has been formatted into arg->str.
14622 Return 1 if the argument has been written into ctx->writer,
14623 Raise an exception and return -1 on error. */
14624static int
14625unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14626 struct unicode_format_arg_t *arg)
14627{
14628#define FORMAT_READ(ctx) \
14629 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14630
14631 PyObject *v;
14632
Victor Stinnera47082312012-10-04 02:19:54 +020014633 if (arg->ch == '(') {
14634 /* Get argument value from a dictionary. Example: "%(name)s". */
14635 Py_ssize_t keystart;
14636 Py_ssize_t keylen;
14637 PyObject *key;
14638 int pcount = 1;
14639
14640 if (ctx->dict == NULL) {
14641 PyErr_SetString(PyExc_TypeError,
14642 "format requires a mapping");
14643 return -1;
14644 }
14645 ++ctx->fmtpos;
14646 --ctx->fmtcnt;
14647 keystart = ctx->fmtpos;
14648 /* Skip over balanced parentheses */
14649 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14650 arg->ch = FORMAT_READ(ctx);
14651 if (arg->ch == ')')
14652 --pcount;
14653 else if (arg->ch == '(')
14654 ++pcount;
14655 ctx->fmtpos++;
14656 }
14657 keylen = ctx->fmtpos - keystart - 1;
14658 if (ctx->fmtcnt < 0 || pcount > 0) {
14659 PyErr_SetString(PyExc_ValueError,
14660 "incomplete format key");
14661 return -1;
14662 }
14663 key = PyUnicode_Substring(ctx->fmtstr,
14664 keystart, keystart + keylen);
14665 if (key == NULL)
14666 return -1;
14667 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014668 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014669 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014670 }
14671 ctx->args = PyObject_GetItem(ctx->dict, key);
14672 Py_DECREF(key);
14673 if (ctx->args == NULL)
14674 return -1;
14675 ctx->args_owned = 1;
14676 ctx->arglen = -1;
14677 ctx->argidx = -2;
14678 }
14679
14680 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014681 while (--ctx->fmtcnt >= 0) {
14682 arg->ch = FORMAT_READ(ctx);
14683 ctx->fmtpos++;
14684 switch (arg->ch) {
14685 case '-': arg->flags |= F_LJUST; continue;
14686 case '+': arg->flags |= F_SIGN; continue;
14687 case ' ': arg->flags |= F_BLANK; continue;
14688 case '#': arg->flags |= F_ALT; continue;
14689 case '0': arg->flags |= F_ZERO; continue;
14690 }
14691 break;
14692 }
14693
14694 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014695 if (arg->ch == '*') {
14696 v = unicode_format_getnextarg(ctx);
14697 if (v == NULL)
14698 return -1;
14699 if (!PyLong_Check(v)) {
14700 PyErr_SetString(PyExc_TypeError,
14701 "* wants int");
14702 return -1;
14703 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014704 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014705 if (arg->width == -1 && PyErr_Occurred())
14706 return -1;
14707 if (arg->width < 0) {
14708 arg->flags |= F_LJUST;
14709 arg->width = -arg->width;
14710 }
14711 if (--ctx->fmtcnt >= 0) {
14712 arg->ch = FORMAT_READ(ctx);
14713 ctx->fmtpos++;
14714 }
14715 }
14716 else if (arg->ch >= '0' && arg->ch <= '9') {
14717 arg->width = arg->ch - '0';
14718 while (--ctx->fmtcnt >= 0) {
14719 arg->ch = FORMAT_READ(ctx);
14720 ctx->fmtpos++;
14721 if (arg->ch < '0' || arg->ch > '9')
14722 break;
14723 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14724 mixing signed and unsigned comparison. Since arg->ch is between
14725 '0' and '9', casting to int is safe. */
14726 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14727 PyErr_SetString(PyExc_ValueError,
14728 "width too big");
14729 return -1;
14730 }
14731 arg->width = arg->width*10 + (arg->ch - '0');
14732 }
14733 }
14734
14735 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014736 if (arg->ch == '.') {
14737 arg->prec = 0;
14738 if (--ctx->fmtcnt >= 0) {
14739 arg->ch = FORMAT_READ(ctx);
14740 ctx->fmtpos++;
14741 }
14742 if (arg->ch == '*') {
14743 v = unicode_format_getnextarg(ctx);
14744 if (v == NULL)
14745 return -1;
14746 if (!PyLong_Check(v)) {
14747 PyErr_SetString(PyExc_TypeError,
14748 "* wants int");
14749 return -1;
14750 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014751 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014752 if (arg->prec == -1 && PyErr_Occurred())
14753 return -1;
14754 if (arg->prec < 0)
14755 arg->prec = 0;
14756 if (--ctx->fmtcnt >= 0) {
14757 arg->ch = FORMAT_READ(ctx);
14758 ctx->fmtpos++;
14759 }
14760 }
14761 else if (arg->ch >= '0' && arg->ch <= '9') {
14762 arg->prec = arg->ch - '0';
14763 while (--ctx->fmtcnt >= 0) {
14764 arg->ch = FORMAT_READ(ctx);
14765 ctx->fmtpos++;
14766 if (arg->ch < '0' || arg->ch > '9')
14767 break;
14768 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14769 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014770 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014771 return -1;
14772 }
14773 arg->prec = arg->prec*10 + (arg->ch - '0');
14774 }
14775 }
14776 }
14777
14778 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14779 if (ctx->fmtcnt >= 0) {
14780 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14781 if (--ctx->fmtcnt >= 0) {
14782 arg->ch = FORMAT_READ(ctx);
14783 ctx->fmtpos++;
14784 }
14785 }
14786 }
14787 if (ctx->fmtcnt < 0) {
14788 PyErr_SetString(PyExc_ValueError,
14789 "incomplete format");
14790 return -1;
14791 }
14792 return 0;
14793
14794#undef FORMAT_READ
14795}
14796
14797/* Format one argument. Supported conversion specifiers:
14798
14799 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014800 - "i", "d", "u": int or float
14801 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014802 - "e", "E", "f", "F", "g", "G": float
14803 - "c": int or str (1 character)
14804
Victor Stinner8dbd4212012-12-04 09:30:24 +010014805 When possible, the output is written directly into the Unicode writer
14806 (ctx->writer). A string is created when padding is required.
14807
Victor Stinnera47082312012-10-04 02:19:54 +020014808 Return 0 if the argument has been formatted into *p_str,
14809 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014810 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014811static int
14812unicode_format_arg_format(struct unicode_formatter_t *ctx,
14813 struct unicode_format_arg_t *arg,
14814 PyObject **p_str)
14815{
14816 PyObject *v;
14817 _PyUnicodeWriter *writer = &ctx->writer;
14818
14819 if (ctx->fmtcnt == 0)
14820 ctx->writer.overallocate = 0;
14821
Victor Stinnera47082312012-10-04 02:19:54 +020014822 v = unicode_format_getnextarg(ctx);
14823 if (v == NULL)
14824 return -1;
14825
Victor Stinnera47082312012-10-04 02:19:54 +020014826
14827 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014828 case 's':
14829 case 'r':
14830 case 'a':
14831 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14832 /* Fast path */
14833 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14834 return -1;
14835 return 1;
14836 }
14837
14838 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14839 *p_str = v;
14840 Py_INCREF(*p_str);
14841 }
14842 else {
14843 if (arg->ch == 's')
14844 *p_str = PyObject_Str(v);
14845 else if (arg->ch == 'r')
14846 *p_str = PyObject_Repr(v);
14847 else
14848 *p_str = PyObject_ASCII(v);
14849 }
14850 break;
14851
14852 case 'i':
14853 case 'd':
14854 case 'u':
14855 case 'o':
14856 case 'x':
14857 case 'X':
14858 {
14859 int ret = mainformatlong(v, arg, p_str, writer);
14860 if (ret != 0)
14861 return ret;
14862 arg->sign = 1;
14863 break;
14864 }
14865
14866 case 'e':
14867 case 'E':
14868 case 'f':
14869 case 'F':
14870 case 'g':
14871 case 'G':
14872 if (arg->width == -1 && arg->prec == -1
14873 && !(arg->flags & (F_SIGN | F_BLANK)))
14874 {
14875 /* Fast path */
14876 if (formatfloat(v, arg, NULL, writer) == -1)
14877 return -1;
14878 return 1;
14879 }
14880
14881 arg->sign = 1;
14882 if (formatfloat(v, arg, p_str, NULL) == -1)
14883 return -1;
14884 break;
14885
14886 case 'c':
14887 {
14888 Py_UCS4 ch = formatchar(v);
14889 if (ch == (Py_UCS4) -1)
14890 return -1;
14891 if (arg->width == -1 && arg->prec == -1) {
14892 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014893 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014894 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014895 return 1;
14896 }
14897 *p_str = PyUnicode_FromOrdinal(ch);
14898 break;
14899 }
14900
14901 default:
14902 PyErr_Format(PyExc_ValueError,
14903 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014904 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014905 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14906 (int)arg->ch,
14907 ctx->fmtpos - 1);
14908 return -1;
14909 }
14910 if (*p_str == NULL)
14911 return -1;
14912 assert (PyUnicode_Check(*p_str));
14913 return 0;
14914}
14915
14916static int
14917unicode_format_arg_output(struct unicode_formatter_t *ctx,
14918 struct unicode_format_arg_t *arg,
14919 PyObject *str)
14920{
14921 Py_ssize_t len;
14922 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014923 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020014924 Py_ssize_t pindex;
14925 Py_UCS4 signchar;
14926 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014927 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014928 Py_ssize_t sublen;
14929 _PyUnicodeWriter *writer = &ctx->writer;
14930 Py_UCS4 fill;
14931
14932 fill = ' ';
14933 if (arg->sign && arg->flags & F_ZERO)
14934 fill = '0';
14935
14936 if (PyUnicode_READY(str) == -1)
14937 return -1;
14938
14939 len = PyUnicode_GET_LENGTH(str);
14940 if ((arg->width == -1 || arg->width <= len)
14941 && (arg->prec == -1 || arg->prec >= len)
14942 && !(arg->flags & (F_SIGN | F_BLANK)))
14943 {
14944 /* Fast path */
14945 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14946 return -1;
14947 return 0;
14948 }
14949
14950 /* Truncate the string for "s", "r" and "a" formats
14951 if the precision is set */
14952 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14953 if (arg->prec >= 0 && len > arg->prec)
14954 len = arg->prec;
14955 }
14956
14957 /* Adjust sign and width */
14958 kind = PyUnicode_KIND(str);
14959 pbuf = PyUnicode_DATA(str);
14960 pindex = 0;
14961 signchar = '\0';
14962 if (arg->sign) {
14963 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14964 if (ch == '-' || ch == '+') {
14965 signchar = ch;
14966 len--;
14967 pindex++;
14968 }
14969 else if (arg->flags & F_SIGN)
14970 signchar = '+';
14971 else if (arg->flags & F_BLANK)
14972 signchar = ' ';
14973 else
14974 arg->sign = 0;
14975 }
14976 if (arg->width < len)
14977 arg->width = len;
14978
14979 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014980 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014981 if (!(arg->flags & F_LJUST)) {
14982 if (arg->sign) {
14983 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014984 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014985 }
14986 else {
14987 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014988 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014989 }
14990 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014991 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14992 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014993 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014994 }
14995
Victor Stinnera47082312012-10-04 02:19:54 +020014996 buflen = arg->width;
14997 if (arg->sign && len == arg->width)
14998 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014999 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015000 return -1;
15001
15002 /* Write the sign if needed */
15003 if (arg->sign) {
15004 if (fill != ' ') {
15005 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15006 writer->pos += 1;
15007 }
15008 if (arg->width > len)
15009 arg->width--;
15010 }
15011
15012 /* Write the numeric prefix for "x", "X" and "o" formats
15013 if the alternate form is used.
15014 For example, write "0x" for the "%#x" format. */
15015 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15016 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15017 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15018 if (fill != ' ') {
15019 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15020 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15021 writer->pos += 2;
15022 pindex += 2;
15023 }
15024 arg->width -= 2;
15025 if (arg->width < 0)
15026 arg->width = 0;
15027 len -= 2;
15028 }
15029
15030 /* Pad left with the fill character if needed */
15031 if (arg->width > len && !(arg->flags & F_LJUST)) {
15032 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015033 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015034 writer->pos += sublen;
15035 arg->width = len;
15036 }
15037
15038 /* If padding with spaces: write sign if needed and/or numeric prefix if
15039 the alternate form is used */
15040 if (fill == ' ') {
15041 if (arg->sign) {
15042 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15043 writer->pos += 1;
15044 }
15045 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15046 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15047 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15048 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15049 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15050 writer->pos += 2;
15051 pindex += 2;
15052 }
15053 }
15054
15055 /* Write characters */
15056 if (len) {
15057 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15058 str, pindex, len);
15059 writer->pos += len;
15060 }
15061
15062 /* Pad right with the fill character if needed */
15063 if (arg->width > len) {
15064 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015065 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015066 writer->pos += sublen;
15067 }
15068 return 0;
15069}
15070
15071/* Helper of PyUnicode_Format(): format one arg.
15072 Return 0 on success, raise an exception and return -1 on error. */
15073static int
15074unicode_format_arg(struct unicode_formatter_t *ctx)
15075{
15076 struct unicode_format_arg_t arg;
15077 PyObject *str;
15078 int ret;
15079
Victor Stinner8dbd4212012-12-04 09:30:24 +010015080 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015081 if (arg.ch == '%') {
15082 ctx->fmtpos++;
15083 ctx->fmtcnt--;
15084 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15085 return -1;
15086 return 0;
15087 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015088 arg.flags = 0;
15089 arg.width = -1;
15090 arg.prec = -1;
15091 arg.sign = 0;
15092 str = NULL;
15093
Victor Stinnera47082312012-10-04 02:19:54 +020015094 ret = unicode_format_arg_parse(ctx, &arg);
15095 if (ret == -1)
15096 return -1;
15097
15098 ret = unicode_format_arg_format(ctx, &arg, &str);
15099 if (ret == -1)
15100 return -1;
15101
15102 if (ret != 1) {
15103 ret = unicode_format_arg_output(ctx, &arg, str);
15104 Py_DECREF(str);
15105 if (ret == -1)
15106 return -1;
15107 }
15108
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015109 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015110 PyErr_SetString(PyExc_TypeError,
15111 "not all arguments converted during string formatting");
15112 return -1;
15113 }
15114 return 0;
15115}
15116
Alexander Belopolsky40018472011-02-26 01:02:56 +000015117PyObject *
15118PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015119{
Victor Stinnera47082312012-10-04 02:19:54 +020015120 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015121
Guido van Rossumd57fd912000-03-10 22:53:23 +000015122 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015123 PyErr_BadInternalCall();
15124 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015125 }
Victor Stinnera47082312012-10-04 02:19:54 +020015126
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015127 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015128 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015129
15130 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015131 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15132 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15133 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15134 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015135
Victor Stinner8f674cc2013-04-17 23:02:17 +020015136 _PyUnicodeWriter_Init(&ctx.writer);
15137 ctx.writer.min_length = ctx.fmtcnt + 100;
15138 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015139
Guido van Rossumd57fd912000-03-10 22:53:23 +000015140 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015141 ctx.arglen = PyTuple_Size(args);
15142 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015143 }
15144 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015145 ctx.arglen = -1;
15146 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015147 }
Victor Stinnera47082312012-10-04 02:19:54 +020015148 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015149 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015150 ctx.dict = args;
15151 else
15152 ctx.dict = NULL;
15153 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015154
Victor Stinnera47082312012-10-04 02:19:54 +020015155 while (--ctx.fmtcnt >= 0) {
15156 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015157 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015158
15159 nonfmtpos = ctx.fmtpos++;
15160 while (ctx.fmtcnt >= 0 &&
15161 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15162 ctx.fmtpos++;
15163 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015164 }
Victor Stinnera47082312012-10-04 02:19:54 +020015165 if (ctx.fmtcnt < 0) {
15166 ctx.fmtpos--;
15167 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015168 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015169
Victor Stinnercfc4c132013-04-03 01:48:39 +020015170 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15171 nonfmtpos, ctx.fmtpos) < 0)
15172 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015173 }
15174 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015175 ctx.fmtpos++;
15176 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015177 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015178 }
15179 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015180
Victor Stinnera47082312012-10-04 02:19:54 +020015181 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015182 PyErr_SetString(PyExc_TypeError,
15183 "not all arguments converted during string formatting");
15184 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015185 }
15186
Victor Stinnera47082312012-10-04 02:19:54 +020015187 if (ctx.args_owned) {
15188 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015189 }
Victor Stinnera47082312012-10-04 02:19:54 +020015190 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015191
Benjamin Peterson29060642009-01-31 22:14:21 +000015192 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015193 _PyUnicodeWriter_Dealloc(&ctx.writer);
15194 if (ctx.args_owned) {
15195 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015196 }
15197 return NULL;
15198}
15199
Jeremy Hylton938ace62002-07-17 16:30:39 +000015200static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015201unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15202
Tim Peters6d6c1a32001-08-02 04:15:00 +000015203static PyObject *
15204unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15205{
Benjamin Peterson29060642009-01-31 22:14:21 +000015206 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015207 static char *kwlist[] = {"object", "encoding", "errors", 0};
15208 char *encoding = NULL;
15209 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015210
Benjamin Peterson14339b62009-01-31 16:36:08 +000015211 if (type != &PyUnicode_Type)
15212 return unicode_subtype_new(type, args, kwds);
15213 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015214 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015215 return NULL;
15216 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015217 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015218 if (encoding == NULL && errors == NULL)
15219 return PyObject_Str(x);
15220 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015221 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015222}
15223
Guido van Rossume023fe02001-08-30 03:12:59 +000015224static PyObject *
15225unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15226{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015227 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015228 Py_ssize_t length, char_size;
15229 int share_wstr, share_utf8;
15230 unsigned int kind;
15231 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015232
Benjamin Peterson14339b62009-01-31 16:36:08 +000015233 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015234
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015235 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015236 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015237 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015238 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015239 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015240 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015241 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015242 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015243
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015244 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015245 if (self == NULL) {
15246 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015247 return NULL;
15248 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015249 kind = PyUnicode_KIND(unicode);
15250 length = PyUnicode_GET_LENGTH(unicode);
15251
15252 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015253#ifdef Py_DEBUG
15254 _PyUnicode_HASH(self) = -1;
15255#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015256 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015257#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015258 _PyUnicode_STATE(self).interned = 0;
15259 _PyUnicode_STATE(self).kind = kind;
15260 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015261 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015262 _PyUnicode_STATE(self).ready = 1;
15263 _PyUnicode_WSTR(self) = NULL;
15264 _PyUnicode_UTF8_LENGTH(self) = 0;
15265 _PyUnicode_UTF8(self) = NULL;
15266 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015267 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015268
15269 share_utf8 = 0;
15270 share_wstr = 0;
15271 if (kind == PyUnicode_1BYTE_KIND) {
15272 char_size = 1;
15273 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15274 share_utf8 = 1;
15275 }
15276 else if (kind == PyUnicode_2BYTE_KIND) {
15277 char_size = 2;
15278 if (sizeof(wchar_t) == 2)
15279 share_wstr = 1;
15280 }
15281 else {
15282 assert(kind == PyUnicode_4BYTE_KIND);
15283 char_size = 4;
15284 if (sizeof(wchar_t) == 4)
15285 share_wstr = 1;
15286 }
15287
15288 /* Ensure we won't overflow the length. */
15289 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15290 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015291 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015292 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015293 data = PyObject_MALLOC((length + 1) * char_size);
15294 if (data == NULL) {
15295 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015296 goto onError;
15297 }
15298
Victor Stinnerc3c74152011-10-02 20:39:55 +020015299 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015300 if (share_utf8) {
15301 _PyUnicode_UTF8_LENGTH(self) = length;
15302 _PyUnicode_UTF8(self) = data;
15303 }
15304 if (share_wstr) {
15305 _PyUnicode_WSTR_LENGTH(self) = length;
15306 _PyUnicode_WSTR(self) = (wchar_t *)data;
15307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015308
Christian Heimesf051e432016-09-13 20:22:02 +020015309 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015310 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015311 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015312#ifdef Py_DEBUG
15313 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15314#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015315 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015316 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015317
15318onError:
15319 Py_DECREF(unicode);
15320 Py_DECREF(self);
15321 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015322}
15323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015324PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015325"str(object='') -> str\n\
15326str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015327\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015328Create a new string object from the given object. If encoding or\n\
15329errors is specified, then the object must expose a data buffer\n\
15330that will be decoded using the given encoding and error handler.\n\
15331Otherwise, returns the result of object.__str__() (if defined)\n\
15332or repr(object).\n\
15333encoding defaults to sys.getdefaultencoding().\n\
15334errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015335
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015336static PyObject *unicode_iter(PyObject *seq);
15337
Guido van Rossumd57fd912000-03-10 22:53:23 +000015338PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015339 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015340 "str", /* tp_name */
15341 sizeof(PyUnicodeObject), /* tp_basicsize */
15342 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015343 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015344 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015345 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015346 0, /* tp_getattr */
15347 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015348 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015349 unicode_repr, /* tp_repr */
15350 &unicode_as_number, /* tp_as_number */
15351 &unicode_as_sequence, /* tp_as_sequence */
15352 &unicode_as_mapping, /* tp_as_mapping */
15353 (hashfunc) unicode_hash, /* tp_hash*/
15354 0, /* tp_call*/
15355 (reprfunc) unicode_str, /* tp_str */
15356 PyObject_GenericGetAttr, /* tp_getattro */
15357 0, /* tp_setattro */
15358 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015359 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015360 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15361 unicode_doc, /* tp_doc */
15362 0, /* tp_traverse */
15363 0, /* tp_clear */
15364 PyUnicode_RichCompare, /* tp_richcompare */
15365 0, /* tp_weaklistoffset */
15366 unicode_iter, /* tp_iter */
15367 0, /* tp_iternext */
15368 unicode_methods, /* tp_methods */
15369 0, /* tp_members */
15370 0, /* tp_getset */
15371 &PyBaseObject_Type, /* tp_base */
15372 0, /* tp_dict */
15373 0, /* tp_descr_get */
15374 0, /* tp_descr_set */
15375 0, /* tp_dictoffset */
15376 0, /* tp_init */
15377 0, /* tp_alloc */
15378 unicode_new, /* tp_new */
15379 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015380};
15381
15382/* Initialize the Unicode implementation */
15383
Victor Stinner331a6a52019-05-27 16:39:22 +020015384PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015385_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015386{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015387 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015388 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015389 0x000A, /* LINE FEED */
15390 0x000D, /* CARRIAGE RETURN */
15391 0x001C, /* FILE SEPARATOR */
15392 0x001D, /* GROUP SEPARATOR */
15393 0x001E, /* RECORD SEPARATOR */
15394 0x0085, /* NEXT LINE */
15395 0x2028, /* LINE SEPARATOR */
15396 0x2029, /* PARAGRAPH SEPARATOR */
15397 };
15398
Fred Drakee4315f52000-05-09 19:53:39 +000015399 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015400 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015401 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015402 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015403 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015404 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015405
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015406 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015407 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015408 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015409
15410 /* initialize the linebreak bloom filter */
15411 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015412 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015413 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015414
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015415 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015416 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015417 }
15418 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015419 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015420 }
15421 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015422 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015423 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015424 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015425}
15426
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015427
Walter Dörwald16807132007-05-25 13:52:07 +000015428void
15429PyUnicode_InternInPlace(PyObject **p)
15430{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015431 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015432 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015433#ifdef Py_DEBUG
15434 assert(s != NULL);
15435 assert(_PyUnicode_CHECK(s));
15436#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015437 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015438 return;
15439#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015440 /* If it's a subclass, we don't really know what putting
15441 it in the interned dict might do. */
15442 if (!PyUnicode_CheckExact(s))
15443 return;
15444 if (PyUnicode_CHECK_INTERNED(s))
15445 return;
15446 if (interned == NULL) {
15447 interned = PyDict_New();
15448 if (interned == NULL) {
15449 PyErr_Clear(); /* Don't leave an exception */
15450 return;
15451 }
15452 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015453 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015454 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015455 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015456 if (t == NULL) {
15457 PyErr_Clear();
15458 return;
15459 }
15460 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015461 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015462 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015463 return;
15464 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015465 /* The two references in interned are not counted by refcnt.
15466 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015467 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015468 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015469}
15470
15471void
15472PyUnicode_InternImmortal(PyObject **p)
15473{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015474 PyUnicode_InternInPlace(p);
15475 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015476 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015477 Py_INCREF(*p);
15478 }
Walter Dörwald16807132007-05-25 13:52:07 +000015479}
15480
15481PyObject *
15482PyUnicode_InternFromString(const char *cp)
15483{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015484 PyObject *s = PyUnicode_FromString(cp);
15485 if (s == NULL)
15486 return NULL;
15487 PyUnicode_InternInPlace(&s);
15488 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015489}
15490
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015491
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Release the dict of interned strings.

   Only compiled for memory checker builds (WITH_VALGRIND or __INSURE__).
   Every interned string gets back the references that interning left out
   of its refcount and is flagged as not interned; the dict is then cleared
   and released. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        /* Do not leak `keys` if it is an object of an unexpected type. */
        Py_XDECREF(keys);
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        /* The refcount is updated through Py_SET_REFCNT(), as in
           PyUnicode_InternInPlace(), rather than by assigning to
           Py_REFCNT(). */
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015552
15553
15554/********************* Unicode Iterator **************************/
15555
15556typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015557 PyObject_HEAD
15558 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015559 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015560} unicodeiterobject;
15561
15562static void
15563unicodeiter_dealloc(unicodeiterobject *it)
15564{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015565 _PyObject_GC_UNTRACK(it);
15566 Py_XDECREF(it->it_seq);
15567 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015568}
15569
15570static int
15571unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15572{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015573 Py_VISIT(it->it_seq);
15574 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015575}
15576
15577static PyObject *
15578unicodeiter_next(unicodeiterobject *it)
15579{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015580 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015581
Benjamin Peterson14339b62009-01-31 16:36:08 +000015582 assert(it != NULL);
15583 seq = it->it_seq;
15584 if (seq == NULL)
15585 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015586 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015588 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15589 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015590 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015591 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15592 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015593 if (item != NULL)
15594 ++it->it_index;
15595 return item;
15596 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015597
Benjamin Peterson14339b62009-01-31 16:36:08 +000015598 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015599 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015600 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015601}
15602
15603static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015604unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015605{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015606 Py_ssize_t len = 0;
15607 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015608 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015609 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015610}
15611
15612PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15613
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015614static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015615unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015616{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015617 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015618 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015619 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015620 it->it_seq, it->it_index);
15621 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015622 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015623 if (u == NULL)
15624 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015625 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015626 }
15627}
15628
15629PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15630
15631static PyObject *
15632unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15633{
15634 Py_ssize_t index = PyLong_AsSsize_t(state);
15635 if (index == -1 && PyErr_Occurred())
15636 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015637 if (it->it_seq != NULL) {
15638 if (index < 0)
15639 index = 0;
15640 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15641 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15642 it->it_index = index;
15643 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015644 Py_RETURN_NONE;
15645}
15646
15647PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15648
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015649static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015650 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015651 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015652 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15653 reduce_doc},
15654 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15655 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015656 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015657};
15658
15659PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015660 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15661 "str_iterator", /* tp_name */
15662 sizeof(unicodeiterobject), /* tp_basicsize */
15663 0, /* tp_itemsize */
15664 /* methods */
15665 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015666 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015667 0, /* tp_getattr */
15668 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015669 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015670 0, /* tp_repr */
15671 0, /* tp_as_number */
15672 0, /* tp_as_sequence */
15673 0, /* tp_as_mapping */
15674 0, /* tp_hash */
15675 0, /* tp_call */
15676 0, /* tp_str */
15677 PyObject_GenericGetAttr, /* tp_getattro */
15678 0, /* tp_setattro */
15679 0, /* tp_as_buffer */
15680 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15681 0, /* tp_doc */
15682 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15683 0, /* tp_clear */
15684 0, /* tp_richcompare */
15685 0, /* tp_weaklistoffset */
15686 PyObject_SelfIter, /* tp_iter */
15687 (iternextfunc)unicodeiter_next, /* tp_iternext */
15688 unicodeiter_methods, /* tp_methods */
15689 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015690};
15691
15692static PyObject *
15693unicode_iter(PyObject *seq)
15694{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015695 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015696
Benjamin Peterson14339b62009-01-31 16:36:08 +000015697 if (!PyUnicode_Check(seq)) {
15698 PyErr_BadInternalCall();
15699 return NULL;
15700 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015701 if (PyUnicode_READY(seq) == -1)
15702 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015703 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15704 if (it == NULL)
15705 return NULL;
15706 it->it_index = 0;
15707 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015708 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015709 _PyObject_GC_TRACK(it);
15710 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015711}
15712
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015713
15714size_t
15715Py_UNICODE_strlen(const Py_UNICODE *u)
15716{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015717 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015718}
15719
15720Py_UNICODE*
15721Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15722{
15723 Py_UNICODE *u = s1;
15724 while ((*u++ = *s2++));
15725 return s1;
15726}
15727
15728Py_UNICODE*
15729Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15730{
15731 Py_UNICODE *u = s1;
15732 while ((*u++ = *s2++))
15733 if (n-- == 0)
15734 break;
15735 return s1;
15736}
15737
15738Py_UNICODE*
15739Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15740{
15741 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015742 u1 += wcslen(u1);
15743 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015744 return s1;
15745}
15746
15747int
15748Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15749{
15750 while (*s1 && *s2 && *s1 == *s2)
15751 s1++, s2++;
15752 if (*s1 && *s2)
15753 return (*s1 < *s2) ? -1 : +1;
15754 if (*s1)
15755 return 1;
15756 if (*s2)
15757 return -1;
15758 return 0;
15759}
15760
15761int
15762Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15763{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015764 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015765 for (; n != 0; n--) {
15766 u1 = *s1;
15767 u2 = *s2;
15768 if (u1 != u2)
15769 return (u1 < u2) ? -1 : +1;
15770 if (u1 == '\0')
15771 return 0;
15772 s1++;
15773 s2++;
15774 }
15775 return 0;
15776}
15777
15778Py_UNICODE*
15779Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15780{
15781 const Py_UNICODE *p;
15782 for (p = s; *p; p++)
15783 if (*p == c)
15784 return (Py_UNICODE*)p;
15785 return NULL;
15786}
15787
15788Py_UNICODE*
15789Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15790{
15791 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015792 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015793 while (p != s) {
15794 p--;
15795 if (*p == c)
15796 return (Py_UNICODE*)p;
15797 }
15798 return NULL;
15799}
Victor Stinner331ea922010-08-10 16:37:20 +000015800
Victor Stinner71133ff2010-09-01 23:43:53 +000015801Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015802PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015803{
Victor Stinner577db2c2011-10-11 22:12:48 +020015804 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015805 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015807 if (!PyUnicode_Check(unicode)) {
15808 PyErr_BadArgument();
15809 return NULL;
15810 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015811 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015812 if (u == NULL)
15813 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015814 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015815 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015816 PyErr_NoMemory();
15817 return NULL;
15818 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015819 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015820 size *= sizeof(Py_UNICODE);
15821 copy = PyMem_Malloc(size);
15822 if (copy == NULL) {
15823 PyErr_NoMemory();
15824 return NULL;
15825 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015826 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015827 return copy;
15828}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015829
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015830
Victor Stinner709d23d2019-05-02 14:56:30 -040015831static int
15832encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015833{
Victor Stinner709d23d2019-05-02 14:56:30 -040015834 int res;
15835 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15836 if (res == -2) {
15837 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15838 return -1;
15839 }
15840 if (res < 0) {
15841 PyErr_NoMemory();
15842 return -1;
15843 }
15844 return 0;
15845}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015846
Victor Stinner709d23d2019-05-02 14:56:30 -040015847
15848static int
15849config_get_codec_name(wchar_t **config_encoding)
15850{
15851 char *encoding;
15852 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15853 return -1;
15854 }
15855
15856 PyObject *name_obj = NULL;
15857 PyObject *codec = _PyCodec_Lookup(encoding);
15858 PyMem_RawFree(encoding);
15859
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015860 if (!codec)
15861 goto error;
15862
15863 name_obj = PyObject_GetAttrString(codec, "name");
15864 Py_CLEAR(codec);
15865 if (!name_obj) {
15866 goto error;
15867 }
15868
Victor Stinner709d23d2019-05-02 14:56:30 -040015869 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15870 Py_DECREF(name_obj);
15871 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015872 goto error;
15873 }
15874
Victor Stinner709d23d2019-05-02 14:56:30 -040015875 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15876 if (raw_wname == NULL) {
15877 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015878 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015879 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015880 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015881
15882 PyMem_RawFree(*config_encoding);
15883 *config_encoding = raw_wname;
15884
15885 PyMem_Free(wname);
15886 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015887
15888error:
15889 Py_XDECREF(codec);
15890 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015891 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015892}
15893
15894
Victor Stinner331a6a52019-05-27 16:39:22 +020015895static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015896init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015897{
Victor Stinner709d23d2019-05-02 14:56:30 -040015898 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020015899 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040015900 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015901 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015902 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015903 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015904 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015905}
15906
15907
Victor Stinner709d23d2019-05-02 14:56:30 -040015908static int
15909init_fs_codec(PyInterpreterState *interp)
15910{
Victor Stinnerda7933e2020-04-13 03:04:28 +020015911 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040015912
15913 _Py_error_handler error_handler;
15914 error_handler = get_error_handler_wide(config->filesystem_errors);
15915 if (error_handler == _Py_ERROR_UNKNOWN) {
15916 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15917 return -1;
15918 }
15919
15920 char *encoding, *errors;
15921 if (encode_wstr_utf8(config->filesystem_encoding,
15922 &encoding,
15923 "filesystem_encoding") < 0) {
15924 return -1;
15925 }
15926
15927 if (encode_wstr_utf8(config->filesystem_errors,
15928 &errors,
15929 "filesystem_errors") < 0) {
15930 PyMem_RawFree(encoding);
15931 return -1;
15932 }
15933
15934 PyMem_RawFree(interp->fs_codec.encoding);
15935 interp->fs_codec.encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015936 /* encoding has been normalized by init_fs_encoding() */
15937 interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0);
Victor Stinner709d23d2019-05-02 14:56:30 -040015938 PyMem_RawFree(interp->fs_codec.errors);
15939 interp->fs_codec.errors = errors;
15940 interp->fs_codec.error_handler = error_handler;
15941
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015942#ifdef _Py_FORCE_UTF8_FS_ENCODING
15943 assert(interp->fs_codec.utf8 == 1);
15944#endif
15945
Victor Stinner709d23d2019-05-02 14:56:30 -040015946 /* At this point, PyUnicode_EncodeFSDefault() and
15947 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15948 the C implementation of the filesystem encoding. */
15949
15950 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15951 global configuration variables. */
15952 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15953 interp->fs_codec.errors) < 0) {
15954 PyErr_NoMemory();
15955 return -1;
15956 }
15957 return 0;
15958}
15959
15960
Victor Stinner331a6a52019-05-27 16:39:22 +020015961static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015962init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015963{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015964 PyInterpreterState *interp = tstate->interp;
15965
Victor Stinner709d23d2019-05-02 14:56:30 -040015966 /* Update the filesystem encoding to the normalized Python codec name.
15967 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15968 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020015969 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040015970 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015971 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015972 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015973 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015974 }
15975
Victor Stinner709d23d2019-05-02 14:56:30 -040015976 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015977 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015978 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015979 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015980}
15981
15982
Victor Stinner331a6a52019-05-27 16:39:22 +020015983PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015984_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015985{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015986 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015987 if (_PyStatus_EXCEPTION(status)) {
15988 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015989 }
15990
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015991 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015992}
15993
15994
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015995static void
15996_PyUnicode_FiniEncodings(PyThreadState *tstate)
15997{
15998 PyInterpreterState *interp = tstate->interp;
15999 PyMem_RawFree(interp->fs_codec.encoding);
16000 interp->fs_codec.encoding = NULL;
16001 interp->fs_codec.utf8 = 0;
16002 PyMem_RawFree(interp->fs_codec.errors);
16003 interp->fs_codec.errors = NULL;
16004 interp->fs_codec.error_handler = _Py_ERROR_UNKNOWN;
16005}
16006
16007
#ifdef MS_WINDOWS
/* Set the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Return the result of init_fs_codec() on success, or -1 with
   MemoryError set if the new configuration strings cannot be
   allocated (the configuration is then left unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacements before touching the configuration. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed: release whichever succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16033
16034
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016035void
Victor Stinner3d483342019-11-22 12:27:50 +010016036_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016037{
Victor Stinner3d483342019-11-22 12:27:50 +010016038 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016039#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016040 /* Insure++ is a memory analysis tool that aids in discovering
16041 * memory leaks and other memory problems. On Python exit, the
16042 * interned string dictionaries are flagged as being in use at exit
16043 * (which it is). Under normal circumstances, this is fine because
16044 * the memory will be automatically reclaimed by the system. Under
16045 * memory debugging, it's a huge source of useless noise, so we
16046 * trade off slower shutdown for less distraction in the memory
16047 * reports. -baw
16048 */
16049 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016050#endif /* __INSURE__ */
16051
Victor Stinner3d483342019-11-22 12:27:50 +010016052 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016053
Victor Stinner3d483342019-11-22 12:27:50 +010016054 for (Py_ssize_t i = 0; i < 256; i++) {
16055 Py_CLEAR(unicode_latin1[i]);
16056 }
16057 _PyUnicode_ClearStaticStrings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016058 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016059
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016060 _PyUnicode_FiniEncodings(tstate);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016061}
16062
16063
Georg Brandl66c221e2010-10-14 07:04:07 +000016064/* A _string module, to export formatter_parser and formatter_field_name_split
16065 to the string.Formatter class implemented in Python. */
16066
16067static PyMethodDef _string_methods[] = {
16068 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16069 METH_O, PyDoc_STR("split the argument as a field name")},
16070 {"formatter_parser", (PyCFunction) formatter_parser,
16071 METH_O, PyDoc_STR("parse the argument as a format string")},
16072 {NULL, NULL}
16073};
16074
16075static struct PyModuleDef _string_module = {
16076 PyModuleDef_HEAD_INIT,
16077 "_string",
16078 PyDoc_STR("string helper module"),
16079 0,
16080 _string_methods,
16081 NULL,
16082 NULL,
16083 NULL,
16084 NULL
16085};
16086
16087PyMODINIT_FUNC
16088PyInit__string(void)
16089{
16090 return PyModule_Create(&_string_module);
16091}
16092
16093
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016094#ifdef __cplusplus
16095}
16096#endif