blob: 04ca5f3344470e7b14d72654f60b49a01de99831 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010044#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000045#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050046#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070047#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080071
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000079
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000080
81#ifdef __cplusplus
82extern "C" {
83#endif
84
/* Largest valid Unicode code point (Unicode 6.0): U+10FFFF, i.e. 1,114,111. */
#define MAX_UNICODE 0x10FFFF
87
/* Type check used by the assertions in this file.  Release builds only
   test the type with PyUnicode_Check(); Py_DEBUG builds additionally run
   the structural consistency checker (without the content scan). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020093
/* Unchecked access to the utf8 field of the compact header. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked UTF-8 pointer of a ready string.  For a compact ASCII object
   this is the memory immediately following the PyASCIIObject header;
   otherwise it is the stored utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field of the compact header. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length of a ready string.  For a compact ASCII object
   this is the `length` field; otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors (usable as lvalues). */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Type-checked (via assert) readers for the kind and the length. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128
/* File-local replacement for the public PyUnicode_READY macro: asserts the
   type with _PyUnicode_CHECK, evaluates to 0 when the string is already
   ready and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_READY(op) ? _PyUnicode_Ready(op) : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200135
/* True if the utf8 pointer of `op` is the same address as its character
   data, i.e. the UTF-8 form is not a separately stored block.  Must not be
   used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same address as its character
   data.  The macro now refers to its own parameter throughout; it used to
   read `_PyUnicode_WSTR(unicode)`, which only compiled (and only tested the
   intended object) when the caller's variable happened to be named
   `unicode`. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
143
/* Non-zero when the object holds a UTF-8 buffer of its own: it is not a
   compact ASCII string, its utf8 pointer is set, and that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    ((!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object holds a wstr buffer of its own: the wstr
   pointer is set and either the string is not ready yet or the pointer is
   not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    ((_PyUnicode_WSTR(op) \
      && (!PyUnicode_IS_READY(op) \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
157
/* Copy a run of characters while converting the element type.

   from_type / to_type: valid type names of the source and destination
                        elements.
   begin, end:          bounds of the source run; they are cast to
                        "from_type *" (end is one past the last element).
   to:                  destination buffer, cast to "to_type *".

   Each element is converted with a plain cast.  The bulk of the run is
   copied four elements per iteration, the remaining 0-3 elements one by
   one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t _len = (_end) - (_iter);             \
        const from_type *_quad_end =                    \
            _iter + _Py_SIZE_ROUND_DOWN(_len, 4);       \
        for (; _iter < (_quad_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); _iter++, _to++) {        \
            *_to = (to_type) *_iter;                    \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200181
/* Platform-specific over-allocation factor.  A factor of 2 corresponds to
   over-allocating by 50% (best on Windows), a factor of 4 to 25% (best on
   Linux). */
#ifndef MS_WINDOWS
#  define OVERALLOCATE_FACTOR 4
#else
#  define OVERALLOCATE_FACTOR 2
#endif
189
Walter Dörwald16807132007-05-25 13:52:07 +0000190/* This dictionary holds all interned unicode strings. Note that references
191 to strings in this dictionary are *not* counted in the string's ob_refcnt.
192 When the interned string reaches a refcnt of 0 the string deallocation
193 function will delete the reference from this dictionary.
194
195 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000196 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000197*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000199
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000200/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200201static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200202
Serhiy Storchaka678db842013-01-26 12:16:36 +0200203#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200204 do { \
205 if (unicode_empty != NULL) \
206 Py_INCREF(unicode_empty); \
207 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200208 unicode_empty = PyUnicode_New(0, 0); \
209 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200210 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
212 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200214 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000215
Serhiy Storchaka678db842013-01-26 12:16:36 +0200216#define _Py_RETURN_UNICODE_EMPTY() \
217 do { \
218 _Py_INCREF_UNICODE_EMPTY(); \
219 return unicode_empty; \
220 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000221
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200222/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700223static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200224_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
225
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200226/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200227static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200228
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000229/* Single character Unicode strings in the Latin-1 range are being
230 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200231static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000232
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code: 1 for whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
       0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
263
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200264/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200265static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200266static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100267static int unicode_modifiable(PyObject *unicode);
268
Victor Stinnerfe226c02011-10-03 03:52:20 +0200269
Alexander Belopolsky40018472011-02-26 01:02:56 +0000270static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100271_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200272static PyObject *
273_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
274static PyObject *
275_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
276
277static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000278unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000279 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100280 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000281 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
282
Alexander Belopolsky40018472011-02-26 01:02:56 +0000283static void
284raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300285 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100286 PyObject *unicode,
287 Py_ssize_t startpos, Py_ssize_t endpos,
288 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000289
/* Same kind of lookup table for line breaks.
   128-entry table indexed by ASCII code: 1 for a line break character,
   0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
317
INADA Naoki3ae20562017-01-16 20:41:20 +0900318static int convert_uc(PyObject *obj, void *addr);
319
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300320#include "clinic/unicodeobject.c.h"
321
Victor Stinner3d4226a2018-08-29 22:21:32 +0200322_Py_error_handler
323_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200324{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200325 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200326 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200327 }
328 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200329 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200330 }
331 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200332 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200333 }
334 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200335 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200336 }
337 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200338 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200339 }
340 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200341 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200342 }
343 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200344 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200345 }
Victor Stinner50149202015-09-22 00:26:54 +0200346 return _Py_ERROR_OTHER;
347}
348
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300349/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
350 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000351Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000352PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000353{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000354#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000355 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000356#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000357 /* This is actually an illegal character, so it should
358 not be passed to unichr. */
359 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000360#endif
361}
362
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a Unicode object.

   op:            object to check; every check is made through
                  _PyObject_ASSERT() with `op` as the reported object.
   check_content: when non-zero (and the string is not in the wchar_t-only
                  state), also scan the characters to verify that the
                  narrowest possible kind is used and that the buffer ends
                  with a null character.

   Returns 1 whenever it returns, so that it can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define ASSERT(expr) _PyObject_ASSERT(op, (expr))

    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    ASSERT(PyUnicode_Check(op));

    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        ASSERT(kind == PyUnicode_1BYTE_KIND);
        ASSERT(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            ASSERT(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            ASSERT(ascii->state.ascii == 0);
            ASSERT(ascii->state.ready == 1);
            ASSERT(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready: only the wstr representation exists */
                ASSERT(ascii->length == 0);
                ASSERT(ascii->hash == -1);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ascii == 0);
                ASSERT(ascii->state.ready == 0);
                ASSERT(ascii->state.interned == SSTATE_NOT_INTERNED);
                ASSERT(ascii->wstr != NULL);
                ASSERT(data == NULL);
                ASSERT(compact->utf8 == NULL);
            }
            else {
                /* non-compact, ready */
                ASSERT(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ready == 1);
                ASSERT(data != NULL);
                if (ascii->state.ascii) {
                    ASSERT(compact->utf8 == data);
                    ASSERT(compact->utf8_length == ascii->length);
                }
                else {
                    ASSERT(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the character width
               of the kind equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
            int wstr_aliases_data = (kind == PyUnicode_2BYTE_KIND);
#else
            int wstr_aliases_data = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (wstr_aliases_data) {
                ASSERT(ascii->wstr == data);
                ASSERT(compact->wstr_length == ascii->length);
            }
            else {
                ASSERT(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            ASSERT(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            ASSERT(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 highest = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > highest) {
                highest = ch;
            }
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                ASSERT(highest >= 128);
                ASSERT(highest <= 255);
            }
            else {
                ASSERT(highest < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            ASSERT(highest >= 0x100);
            ASSERT(highest <= 0xFFFF);
        }
        else {
            ASSERT(highest >= 0x10000);
            ASSERT(highest <= MAX_UNICODE);
        }
        /* the element one past the last character must be null */
        ASSERT(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef ASSERT
}
#endif
482
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483static PyObject*
484unicode_result_wchar(PyObject *unicode)
485{
486#ifndef Py_DEBUG
487 Py_ssize_t len;
488
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489 len = _PyUnicode_WSTR_LENGTH(unicode);
490 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100491 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200492 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100493 }
494
495 if (len == 1) {
496 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100497 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100498 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
499 Py_DECREF(unicode);
500 return latin1_char;
501 }
502 }
503
504 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200505 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100506 return NULL;
507 }
508#else
Victor Stinneraa771272012-10-04 02:32:58 +0200509 assert(Py_REFCNT(unicode) == 1);
510
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100511 /* don't make the result ready in debug mode to ensure that the caller
512 makes the string ready before using it */
513 assert(_PyUnicode_CheckConsistency(unicode, 1));
514#endif
515 return unicode;
516}
517
518static PyObject*
519unicode_result_ready(PyObject *unicode)
520{
521 Py_ssize_t length;
522
523 length = PyUnicode_GET_LENGTH(unicode);
524 if (length == 0) {
525 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200527 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100528 }
529 return unicode_empty;
530 }
531
532 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200533 void *data = PyUnicode_DATA(unicode);
534 int kind = PyUnicode_KIND(unicode);
535 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100536 if (ch < 256) {
537 PyObject *latin1_char = unicode_latin1[ch];
538 if (latin1_char != NULL) {
539 if (unicode != latin1_char) {
540 Py_INCREF(latin1_char);
541 Py_DECREF(unicode);
542 }
543 return latin1_char;
544 }
545 else {
546 assert(_PyUnicode_CheckConsistency(unicode, 1));
547 Py_INCREF(unicode);
548 unicode_latin1[ch] = unicode;
549 return unicode;
550 }
551 }
552 }
553
554 assert(_PyUnicode_CheckConsistency(unicode, 1));
555 return unicode;
556}
557
558static PyObject*
559unicode_result(PyObject *unicode)
560{
561 assert(_PyUnicode_CHECK(unicode));
562 if (PyUnicode_IS_READY(unicode))
563 return unicode_result_ready(unicode);
564 else
565 return unicode_result_wchar(unicode);
566}
567
Victor Stinnerc4b49542011-12-11 22:44:26 +0100568static PyObject*
569unicode_result_unchanged(PyObject *unicode)
570{
571 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500572 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100573 return NULL;
574 Py_INCREF(unicode);
575 return unicode;
576 }
577 else
578 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100579 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100580}
581
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200582/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
583 ASCII, Latin1, UTF-8, etc. */
584static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200585backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200586 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
587{
Victor Stinnerad771582015-10-09 12:38:53 +0200588 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200589 Py_UCS4 ch;
590 enum PyUnicode_Kind kind;
591 void *data;
592
593 assert(PyUnicode_IS_READY(unicode));
594 kind = PyUnicode_KIND(unicode);
595 data = PyUnicode_DATA(unicode);
596
597 size = 0;
598 /* determine replacement size */
599 for (i = collstart; i < collend; ++i) {
600 Py_ssize_t incr;
601
602 ch = PyUnicode_READ(kind, data, i);
603 if (ch < 0x100)
604 incr = 2+2;
605 else if (ch < 0x10000)
606 incr = 2+4;
607 else {
608 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200609 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200610 }
611 if (size > PY_SSIZE_T_MAX - incr) {
612 PyErr_SetString(PyExc_OverflowError,
613 "encoded result is too long for a Python string");
614 return NULL;
615 }
616 size += incr;
617 }
618
Victor Stinnerad771582015-10-09 12:38:53 +0200619 str = _PyBytesWriter_Prepare(writer, str, size);
620 if (str == NULL)
621 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200622
623 /* generate replacement */
624 for (i = collstart; i < collend; ++i) {
625 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200626 *str++ = '\\';
627 if (ch >= 0x00010000) {
628 *str++ = 'U';
629 *str++ = Py_hexdigits[(ch>>28)&0xf];
630 *str++ = Py_hexdigits[(ch>>24)&0xf];
631 *str++ = Py_hexdigits[(ch>>20)&0xf];
632 *str++ = Py_hexdigits[(ch>>16)&0xf];
633 *str++ = Py_hexdigits[(ch>>12)&0xf];
634 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
Victor Stinner797485e2015-10-09 03:17:30 +0200636 else if (ch >= 0x100) {
637 *str++ = 'u';
638 *str++ = Py_hexdigits[(ch>>12)&0xf];
639 *str++ = Py_hexdigits[(ch>>8)&0xf];
640 }
641 else
642 *str++ = 'x';
643 *str++ = Py_hexdigits[(ch>>4)&0xf];
644 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200645 }
646 return str;
647}
648
649/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
650 ASCII, Latin1, UTF-8, etc. */
651static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200652xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200653 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
654{
Victor Stinnerad771582015-10-09 12:38:53 +0200655 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200656 Py_UCS4 ch;
657 enum PyUnicode_Kind kind;
658 void *data;
659
660 assert(PyUnicode_IS_READY(unicode));
661 kind = PyUnicode_KIND(unicode);
662 data = PyUnicode_DATA(unicode);
663
664 size = 0;
665 /* determine replacement size */
666 for (i = collstart; i < collend; ++i) {
667 Py_ssize_t incr;
668
669 ch = PyUnicode_READ(kind, data, i);
670 if (ch < 10)
671 incr = 2+1+1;
672 else if (ch < 100)
673 incr = 2+2+1;
674 else if (ch < 1000)
675 incr = 2+3+1;
676 else if (ch < 10000)
677 incr = 2+4+1;
678 else if (ch < 100000)
679 incr = 2+5+1;
680 else if (ch < 1000000)
681 incr = 2+6+1;
682 else {
683 assert(ch <= MAX_UNICODE);
684 incr = 2+7+1;
685 }
686 if (size > PY_SSIZE_T_MAX - incr) {
687 PyErr_SetString(PyExc_OverflowError,
688 "encoded result is too long for a Python string");
689 return NULL;
690 }
691 size += incr;
692 }
693
Victor Stinnerad771582015-10-09 12:38:53 +0200694 str = _PyBytesWriter_Prepare(writer, str, size);
695 if (str == NULL)
696 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200697
698 /* generate replacement */
699 for (i = collstart; i < collend; ++i) {
700 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
701 }
702 return str;
703}
704
Thomas Wouters477c8d52006-05-27 19:21:47 +0000705/* --- Bloom Filters ----------------------------------------------------- */
706
707/* stuff to implement simple "bloom filters" for Unicode characters.
708 to keep things simple, we use a single bitmask, using the least 5
709 bits from each unicode characters as the bit index. */
710
711/* the linebreak mask is set up by Unicode_Init below */
712
Antoine Pitrouf068f942010-01-13 14:19:12 +0000713#if LONG_BIT >= 128
714#define BLOOM_WIDTH 128
715#elif LONG_BIT >= 64
716#define BLOOM_WIDTH 64
717#elif LONG_BIT >= 32
718#define BLOOM_WIDTH 32
719#else
720#error "LONG_BIT is smaller than 32"
721#endif
722
Thomas Wouters477c8d52006-05-27 19:21:47 +0000723#define BLOOM_MASK unsigned long
724
Serhiy Storchaka05997252013-01-26 12:14:02 +0200725static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000726
Antoine Pitrouf068f942010-01-13 14:19:12 +0000727#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000728
Benjamin Peterson29060642009-01-31 22:14:21 +0000729#define BLOOM_LINEBREAK(ch) \
730 ((ch) < 128U ? ascii_linebreak[(ch)] : \
731 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000732
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700733static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200734make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000735{
Victor Stinnera85af502013-04-09 21:53:54 +0200736#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
737 do { \
738 TYPE *data = (TYPE *)PTR; \
739 TYPE *end = data + LEN; \
740 Py_UCS4 ch; \
741 for (; data != end; data++) { \
742 ch = *data; \
743 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
744 } \
745 break; \
746 } while (0)
747
Thomas Wouters477c8d52006-05-27 19:21:47 +0000748 /* calculate simple bloom-style bitmask for a given unicode string */
749
Antoine Pitrouf068f942010-01-13 14:19:12 +0000750 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000751
752 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200753 switch (kind) {
754 case PyUnicode_1BYTE_KIND:
755 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
756 break;
757 case PyUnicode_2BYTE_KIND:
758 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
759 break;
760 case PyUnicode_4BYTE_KIND:
761 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
762 break;
763 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700764 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200765 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000766 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200767
768#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000769}
770
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300771static int
772ensure_unicode(PyObject *obj)
773{
774 if (!PyUnicode_Check(obj)) {
775 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200776 "must be str, not %.100s",
777 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300778 return -1;
779 }
780 return PyUnicode_READY(obj);
781}
782
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200783/* Compilation of templated routines */
784
785#include "stringlib/asciilib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
791#include "stringlib/find_max_char.h"
792#include "stringlib/localeutil.h"
793#include "stringlib/undef.h"
794
795#include "stringlib/ucs1lib.h"
796#include "stringlib/fastsearch.h"
797#include "stringlib/partition.h"
798#include "stringlib/split.h"
799#include "stringlib/count.h"
800#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300801#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200802#include "stringlib/find_max_char.h"
803#include "stringlib/localeutil.h"
804#include "stringlib/undef.h"
805
806#include "stringlib/ucs2lib.h"
807#include "stringlib/fastsearch.h"
808#include "stringlib/partition.h"
809#include "stringlib/split.h"
810#include "stringlib/count.h"
811#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300812#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200813#include "stringlib/find_max_char.h"
814#include "stringlib/localeutil.h"
815#include "stringlib/undef.h"
816
817#include "stringlib/ucs4lib.h"
818#include "stringlib/fastsearch.h"
819#include "stringlib/partition.h"
820#include "stringlib/split.h"
821#include "stringlib/count.h"
822#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300823#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200824#include "stringlib/find_max_char.h"
825#include "stringlib/localeutil.h"
826#include "stringlib/undef.h"
827
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200828#include "stringlib/unicodedefs.h"
829#include "stringlib/fastsearch.h"
830#include "stringlib/count.h"
831#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100832#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200833
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834/* --- Unicode Object ----------------------------------------------------- */
835
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700836static inline Py_ssize_t
837findchar(const void *s, int kind,
838 Py_ssize_t size, Py_UCS4 ch,
839 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200841 switch (kind) {
842 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200843 if ((Py_UCS1) ch != ch)
844 return -1;
845 if (direction > 0)
846 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
847 else
848 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200849 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200850 if ((Py_UCS2) ch != ch)
851 return -1;
852 if (direction > 0)
853 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
854 else
855 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200856 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200857 if (direction > 0)
858 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
859 else
860 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200861 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700862 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200863 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200864}
865
#ifdef Py_DEBUG
/* Overwrite the characters added by a resize (indexes old_length up to, but
   not including, the current length) with 0xff bytes so that code reading
   them before they are initialized is detected early.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* offsets and sizes are in bytes: scale by the character width */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
884
Victor Stinnerfe226c02011-10-03 03:52:20 +0200885static PyObject*
886resize_compact(PyObject *unicode, Py_ssize_t length)
887{
888 Py_ssize_t char_size;
889 Py_ssize_t struct_size;
890 Py_ssize_t new_size;
891 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100892 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200893#ifdef Py_DEBUG
894 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
895#endif
896
Victor Stinner79891572012-05-03 13:43:07 +0200897 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200898 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100899 assert(PyUnicode_IS_COMPACT(unicode));
900
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200901 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100902 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200903 struct_size = sizeof(PyASCIIObject);
904 else
905 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200906 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200907
Victor Stinnerfe226c02011-10-03 03:52:20 +0200908 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
909 PyErr_NoMemory();
910 return NULL;
911 }
912 new_size = (struct_size + (length + 1) * char_size);
913
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200914 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
915 PyObject_DEL(_PyUnicode_UTF8(unicode));
916 _PyUnicode_UTF8(unicode) = NULL;
917 _PyUnicode_UTF8_LENGTH(unicode) = 0;
918 }
Victor Stinner84def372011-12-11 20:04:56 +0100919 _Py_DEC_REFTOTAL;
920 _Py_ForgetReference(unicode);
921
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300922 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100923 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100924 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 PyErr_NoMemory();
926 return NULL;
927 }
Victor Stinner84def372011-12-11 20:04:56 +0100928 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200929 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100930
Victor Stinnerfe226c02011-10-03 03:52:20 +0200931 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200932 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200933 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100934 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200935 _PyUnicode_WSTR_LENGTH(unicode) = length;
936 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100937 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
938 PyObject_DEL(_PyUnicode_WSTR(unicode));
939 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100940 if (!PyUnicode_IS_ASCII(unicode))
941 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100942 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200943#ifdef Py_DEBUG
944 unicode_fill_invalid(unicode, old_length);
945#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200946 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
947 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200948 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200949 return unicode;
950}
951
Alexander Belopolsky40018472011-02-26 01:02:56 +0000952static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200953resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000954{
Victor Stinner95663112011-10-04 01:03:50 +0200955 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100956 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200958 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000959
Victor Stinnerfe226c02011-10-03 03:52:20 +0200960 if (PyUnicode_IS_READY(unicode)) {
961 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200962 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200963 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200964#ifdef Py_DEBUG
965 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
966#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200967
968 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200969 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200970 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
971 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200972
973 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
974 PyErr_NoMemory();
975 return -1;
976 }
977 new_size = (length + 1) * char_size;
978
Victor Stinner7a9105a2011-12-12 00:13:42 +0100979 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
980 {
981 PyObject_DEL(_PyUnicode_UTF8(unicode));
982 _PyUnicode_UTF8(unicode) = NULL;
983 _PyUnicode_UTF8_LENGTH(unicode) = 0;
984 }
985
Victor Stinnerfe226c02011-10-03 03:52:20 +0200986 data = (PyObject *)PyObject_REALLOC(data, new_size);
987 if (data == NULL) {
988 PyErr_NoMemory();
989 return -1;
990 }
991 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200992 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200993 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200994 _PyUnicode_WSTR_LENGTH(unicode) = length;
995 }
996 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200997 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200998 _PyUnicode_UTF8_LENGTH(unicode) = length;
999 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001000 _PyUnicode_LENGTH(unicode) = length;
1001 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001002#ifdef Py_DEBUG
1003 unicode_fill_invalid(unicode, old_length);
1004#endif
Victor Stinner95663112011-10-04 01:03:50 +02001005 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001006 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001007 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001008 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001009 }
Victor Stinner95663112011-10-04 01:03:50 +02001010 assert(_PyUnicode_WSTR(unicode) != NULL);
1011
1012 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001013 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001014 PyErr_NoMemory();
1015 return -1;
1016 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001017 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001018 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001019 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001020 if (!wstr) {
1021 PyErr_NoMemory();
1022 return -1;
1023 }
1024 _PyUnicode_WSTR(unicode) = wstr;
1025 _PyUnicode_WSTR(unicode)[length] = 0;
1026 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001027 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001028 return 0;
1029}
1030
Victor Stinnerfe226c02011-10-03 03:52:20 +02001031static PyObject*
1032resize_copy(PyObject *unicode, Py_ssize_t length)
1033{
1034 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001035 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001036 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001037
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001038 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039
1040 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1041 if (copy == NULL)
1042 return NULL;
1043
1044 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001045 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001047 }
1048 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001049 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001050
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001051 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052 if (w == NULL)
1053 return NULL;
1054 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1055 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001056 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001057 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001058 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001059 }
1060}
1061
Guido van Rossumd57fd912000-03-10 22:53:23 +00001062/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001063 Ux0000 terminated; some code (e.g. new_identifier)
1064 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065
1066 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001067 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068
1069*/
1070
Alexander Belopolsky40018472011-02-26 01:02:56 +00001071static PyUnicodeObject *
1072_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001074 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076
Thomas Wouters477c8d52006-05-27 19:21:47 +00001077 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078 if (length == 0 && unicode_empty != NULL) {
1079 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001080 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001081 }
1082
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001083 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001084 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001085 return (PyUnicodeObject *)PyErr_NoMemory();
1086 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001087 if (length < 0) {
1088 PyErr_SetString(PyExc_SystemError,
1089 "Negative size passed to _PyUnicode_New");
1090 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 }
1092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1094 if (unicode == NULL)
1095 return NULL;
1096 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001097
1098 _PyUnicode_WSTR_LENGTH(unicode) = length;
1099 _PyUnicode_HASH(unicode) = -1;
1100 _PyUnicode_STATE(unicode).interned = 0;
1101 _PyUnicode_STATE(unicode).kind = 0;
1102 _PyUnicode_STATE(unicode).compact = 0;
1103 _PyUnicode_STATE(unicode).ready = 0;
1104 _PyUnicode_STATE(unicode).ascii = 0;
1105 _PyUnicode_DATA_ANY(unicode) = NULL;
1106 _PyUnicode_LENGTH(unicode) = 0;
1107 _PyUnicode_UTF8(unicode) = NULL;
1108 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1111 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001112 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001113 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001114 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116
Jeremy Hyltond8082792003-09-16 19:41:39 +00001117 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001118 * the caller fails before initializing str -- unicode_resize()
1119 * reads str[0], and the Keep-Alive optimization can keep memory
1120 * allocated for str alive across a call to unicode_dealloc(unicode).
1121 * We don't want unicode_resize to read uninitialized memory in
1122 * that case.
1123 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124 _PyUnicode_WSTR(unicode)[0] = 0;
1125 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001126
Victor Stinner7931d9a2011-11-04 00:22:48 +01001127 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 return unicode;
1129}
1130
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131static const char*
1132unicode_kind_name(PyObject *unicode)
1133{
Victor Stinner42dfd712011-10-03 14:41:45 +02001134 /* don't check consistency: unicode_kind_name() is called from
1135 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001136 if (!PyUnicode_IS_COMPACT(unicode))
1137 {
1138 if (!PyUnicode_IS_READY(unicode))
1139 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001140 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001141 {
1142 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001143 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001144 return "legacy ascii";
1145 else
1146 return "legacy latin1";
1147 case PyUnicode_2BYTE_KIND:
1148 return "legacy UCS2";
1149 case PyUnicode_4BYTE_KIND:
1150 return "legacy UCS4";
1151 default:
1152 return "<legacy invalid kind>";
1153 }
1154 }
1155 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001156 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001157 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001158 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001159 return "ascii";
1160 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001161 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001162 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001163 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001164 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001165 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001166 default:
1167 return "<invalid compact kind>";
1168 }
1169}
1170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001172/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1176
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *compact_data = _PyUnicode_COMPACT_DATA(unicode);
    return compact_data;
}
1180void *_PyUnicode_data(void *unicode){
1181 printf("obj %p\n", unicode);
1182 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1183 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1184 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1185 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1186 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1187 return PyUnicode_DATA(unicode);
1188}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001189
1190void
1191_PyUnicode_Dump(PyObject *op)
1192{
1193 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001194 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1195 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1196 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001197
Victor Stinnera849a4b2011-10-03 12:12:11 +02001198 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001199 {
1200 if (ascii->state.ascii)
1201 data = (ascii + 1);
1202 else
1203 data = (compact + 1);
1204 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001205 else
1206 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001207 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1208 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001209
Victor Stinnera849a4b2011-10-03 12:12:11 +02001210 if (ascii->wstr == data)
1211 printf("shared ");
1212 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001213
Victor Stinnera3b334d2011-10-03 13:53:37 +02001214 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001215 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001216 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1217 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001218 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1219 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001220 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001221 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001222}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001223#endif
1224
1225PyObject *
1226PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1227{
1228 PyObject *obj;
1229 PyCompactUnicodeObject *unicode;
1230 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001231 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001232 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 Py_ssize_t char_size;
1234 Py_ssize_t struct_size;
1235
1236 /* Optimization for empty strings */
1237 if (size == 0 && unicode_empty != NULL) {
1238 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001239 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001240 }
1241
Victor Stinner9e9d6892011-10-04 01:02:02 +02001242 is_ascii = 0;
1243 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244 struct_size = sizeof(PyCompactUnicodeObject);
1245 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001246 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 char_size = 1;
1248 is_ascii = 1;
1249 struct_size = sizeof(PyASCIIObject);
1250 }
1251 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001252 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253 char_size = 1;
1254 }
1255 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001256 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 char_size = 2;
1258 if (sizeof(wchar_t) == 2)
1259 is_sharing = 1;
1260 }
1261 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001262 if (maxchar > MAX_UNICODE) {
1263 PyErr_SetString(PyExc_SystemError,
1264 "invalid maximum character passed to PyUnicode_New");
1265 return NULL;
1266 }
Victor Stinner8f825062012-04-27 13:55:39 +02001267 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 char_size = 4;
1269 if (sizeof(wchar_t) == 4)
1270 is_sharing = 1;
1271 }
1272
1273 /* Ensure we won't overflow the size. */
1274 if (size < 0) {
1275 PyErr_SetString(PyExc_SystemError,
1276 "Negative size passed to PyUnicode_New");
1277 return NULL;
1278 }
1279 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1280 return PyErr_NoMemory();
1281
1282 /* Duplicated allocation code from _PyObject_New() instead of a call to
1283 * PyObject_New() so we are able to allocate space for the object and
1284 * it's data buffer.
1285 */
1286 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1287 if (obj == NULL)
1288 return PyErr_NoMemory();
1289 obj = PyObject_INIT(obj, &PyUnicode_Type);
1290 if (obj == NULL)
1291 return NULL;
1292
1293 unicode = (PyCompactUnicodeObject *)obj;
1294 if (is_ascii)
1295 data = ((PyASCIIObject*)obj) + 1;
1296 else
1297 data = unicode + 1;
1298 _PyUnicode_LENGTH(unicode) = size;
1299 _PyUnicode_HASH(unicode) = -1;
1300 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001301 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001302 _PyUnicode_STATE(unicode).compact = 1;
1303 _PyUnicode_STATE(unicode).ready = 1;
1304 _PyUnicode_STATE(unicode).ascii = is_ascii;
1305 if (is_ascii) {
1306 ((char*)data)[size] = 0;
1307 _PyUnicode_WSTR(unicode) = NULL;
1308 }
Victor Stinner8f825062012-04-27 13:55:39 +02001309 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 ((char*)data)[size] = 0;
1311 _PyUnicode_WSTR(unicode) = NULL;
1312 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001314 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 else {
1317 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001318 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001319 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001321 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 ((Py_UCS4*)data)[size] = 0;
1323 if (is_sharing) {
1324 _PyUnicode_WSTR_LENGTH(unicode) = size;
1325 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1326 }
1327 else {
1328 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1329 _PyUnicode_WSTR(unicode) = NULL;
1330 }
1331 }
Victor Stinner8f825062012-04-27 13:55:39 +02001332#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001333 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001334#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001335 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336 return obj;
1337}
1338
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t buffer [begin, end) to the
   UCS4 data of unicode.  A high surrogate immediately followed by a low
   surrogate is joined into a single code point; every other unit, lone
   surrogates included, is copied unchanged.  The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* src[1] is only read once it is known to lie inside the input */
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1378
Victor Stinnercd9950f2011-10-02 00:34:53 +02001379static int
Victor Stinner488fa492011-12-12 00:01:39 +01001380unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001381{
Victor Stinner488fa492011-12-12 00:01:39 +01001382 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001383 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001384 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001385 return -1;
1386 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001387 return 0;
1388}
1389
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001390static int
1391_copy_characters(PyObject *to, Py_ssize_t to_start,
1392 PyObject *from, Py_ssize_t from_start,
1393 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001395 unsigned int from_kind, to_kind;
1396 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397
Victor Stinneree4544c2012-05-09 22:24:08 +02001398 assert(0 <= how_many);
1399 assert(0 <= from_start);
1400 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001401 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001402 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001403 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404
Victor Stinnerd3f08822012-05-29 12:57:52 +02001405 assert(PyUnicode_Check(to));
1406 assert(PyUnicode_IS_READY(to));
1407 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1408
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001409 if (how_many == 0)
1410 return 0;
1411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001413 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001415 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416
Victor Stinnerf1852262012-06-16 16:38:26 +02001417#ifdef Py_DEBUG
1418 if (!check_maxchar
1419 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1420 {
1421 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1422 Py_UCS4 ch;
1423 Py_ssize_t i;
1424 for (i=0; i < how_many; i++) {
1425 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1426 assert(ch <= to_maxchar);
1427 }
1428 }
1429#endif
1430
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001431 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001432 if (check_maxchar
1433 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1434 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001435 /* Writing Latin-1 characters into an ASCII string requires to
1436 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001437 Py_UCS4 max_char;
1438 max_char = ucs1lib_find_max_char(from_data,
1439 (Py_UCS1*)from_data + how_many);
1440 if (max_char >= 128)
1441 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001442 }
Christian Heimesf051e432016-09-13 20:22:02 +02001443 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001444 (char*)from_data + from_kind * from_start,
1445 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001447 else if (from_kind == PyUnicode_1BYTE_KIND
1448 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001449 {
1450 _PyUnicode_CONVERT_BYTES(
1451 Py_UCS1, Py_UCS2,
1452 PyUnicode_1BYTE_DATA(from) + from_start,
1453 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1454 PyUnicode_2BYTE_DATA(to) + to_start
1455 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001456 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001457 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001458 && to_kind == PyUnicode_4BYTE_KIND)
1459 {
1460 _PyUnicode_CONVERT_BYTES(
1461 Py_UCS1, Py_UCS4,
1462 PyUnicode_1BYTE_DATA(from) + from_start,
1463 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464 PyUnicode_4BYTE_DATA(to) + to_start
1465 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001466 }
1467 else if (from_kind == PyUnicode_2BYTE_KIND
1468 && to_kind == PyUnicode_4BYTE_KIND)
1469 {
1470 _PyUnicode_CONVERT_BYTES(
1471 Py_UCS2, Py_UCS4,
1472 PyUnicode_2BYTE_DATA(from) + from_start,
1473 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1474 PyUnicode_4BYTE_DATA(to) + to_start
1475 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001476 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001477 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001478 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1479
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001480 if (!check_maxchar) {
1481 if (from_kind == PyUnicode_2BYTE_KIND
1482 && to_kind == PyUnicode_1BYTE_KIND)
1483 {
1484 _PyUnicode_CONVERT_BYTES(
1485 Py_UCS2, Py_UCS1,
1486 PyUnicode_2BYTE_DATA(from) + from_start,
1487 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1488 PyUnicode_1BYTE_DATA(to) + to_start
1489 );
1490 }
1491 else if (from_kind == PyUnicode_4BYTE_KIND
1492 && to_kind == PyUnicode_1BYTE_KIND)
1493 {
1494 _PyUnicode_CONVERT_BYTES(
1495 Py_UCS4, Py_UCS1,
1496 PyUnicode_4BYTE_DATA(from) + from_start,
1497 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1498 PyUnicode_1BYTE_DATA(to) + to_start
1499 );
1500 }
1501 else if (from_kind == PyUnicode_4BYTE_KIND
1502 && to_kind == PyUnicode_2BYTE_KIND)
1503 {
1504 _PyUnicode_CONVERT_BYTES(
1505 Py_UCS4, Py_UCS2,
1506 PyUnicode_4BYTE_DATA(from) + from_start,
1507 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508 PyUnicode_2BYTE_DATA(to) + to_start
1509 );
1510 }
1511 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001512 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001513 }
1514 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001515 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001516 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001517 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001518 Py_ssize_t i;
1519
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 for (i=0; i < how_many; i++) {
1521 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001522 if (ch > to_maxchar)
1523 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001524 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1525 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001526 }
1527 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001528 return 0;
1529}
1530
Victor Stinnerd3f08822012-05-29 12:57:52 +02001531void
1532_PyUnicode_FastCopyCharacters(
1533 PyObject *to, Py_ssize_t to_start,
1534 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001535{
1536 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1537}
1538
1539Py_ssize_t
1540PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1541 PyObject *from, Py_ssize_t from_start,
1542 Py_ssize_t how_many)
1543{
1544 int err;
1545
1546 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1547 PyErr_BadInternalCall();
1548 return -1;
1549 }
1550
Benjamin Petersonbac79492012-01-14 13:34:47 -05001551 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001552 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001553 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001554 return -1;
1555
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001556 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001557 PyErr_SetString(PyExc_IndexError, "string index out of range");
1558 return -1;
1559 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001560 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001561 PyErr_SetString(PyExc_IndexError, "string index out of range");
1562 return -1;
1563 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001564 if (how_many < 0) {
1565 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1566 return -1;
1567 }
1568 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001569 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1570 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001571 "Cannot write %zi characters at %zi "
1572 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001573 how_many, to_start, PyUnicode_GET_LENGTH(to));
1574 return -1;
1575 }
1576
1577 if (how_many == 0)
1578 return 0;
1579
Victor Stinner488fa492011-12-12 00:01:39 +01001580 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001581 return -1;
1582
1583 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1584 if (err) {
1585 PyErr_Format(PyExc_SystemError,
1586 "Cannot copy %s characters "
1587 "into a string of %s characters",
1588 unicode_kind_name(from),
1589 unicode_kind_name(to));
1590 return -1;
1591 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001592 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001593}
1594
Victor Stinner17222162011-09-28 22:15:37 +02001595/* Find the maximum code point and count the number of surrogate pairs so a
1596 correct string length can be computed before converting a string to UCS4.
1597 This function counts single surrogates as a character and not as a pair.
1598
1599 Return 0 on success, or -1 on error. */
1600static int
1601find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1602 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603{
1604 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001605 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606
Victor Stinnerc53be962011-10-02 21:33:54 +02001607 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608 *num_surrogates = 0;
1609 *maxchar = 0;
1610
1611 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001612#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001613 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1614 && (iter+1) < end
1615 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1616 {
1617 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1618 ++(*num_surrogates);
1619 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001620 }
1621 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001622#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001623 {
1624 ch = *iter;
1625 iter++;
1626 }
1627 if (ch > *maxchar) {
1628 *maxchar = ch;
1629 if (*maxchar > MAX_UNICODE) {
1630 PyErr_Format(PyExc_ValueError,
1631 "character U+%x is not in range [U+0000; U+10ffff]",
1632 ch);
1633 return -1;
1634 }
1635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 }
1637 return 0;
1638}
1639
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001640int
1641_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642{
1643 wchar_t *end;
1644 Py_UCS4 maxchar = 0;
1645 Py_ssize_t num_surrogates;
1646#if SIZEOF_WCHAR_T == 2
1647 Py_ssize_t length_wo_surrogates;
1648#endif
1649
Georg Brandl7597add2011-10-05 16:36:47 +02001650 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001651 strings were created using _PyObject_New() and where no canonical
1652 representation (the str field) has been set yet aka strings
1653 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001654 assert(_PyUnicode_CHECK(unicode));
1655 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001657 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001658 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001659 /* Actually, it should neither be interned nor be anything else: */
1660 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001663 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001664 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666
1667 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001668 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1669 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 PyErr_NoMemory();
1671 return -1;
1672 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001673 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 _PyUnicode_WSTR(unicode), end,
1675 PyUnicode_1BYTE_DATA(unicode));
1676 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1677 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1678 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1679 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001680 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001681 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001682 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 }
1684 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001685 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001686 _PyUnicode_UTF8(unicode) = NULL;
1687 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 }
1689 PyObject_FREE(_PyUnicode_WSTR(unicode));
1690 _PyUnicode_WSTR(unicode) = NULL;
1691 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1692 }
1693 /* In this case we might have to convert down from 4-byte native
1694 wchar_t to 2-byte unicode. */
1695 else if (maxchar < 65536) {
1696 assert(num_surrogates == 0 &&
1697 "FindMaxCharAndNumSurrogatePairs() messed up");
1698
Victor Stinner506f5922011-09-28 22:34:18 +02001699#if SIZEOF_WCHAR_T == 2
1700 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001701 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001702 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1703 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1704 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001705 _PyUnicode_UTF8(unicode) = NULL;
1706 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001707#else
1708 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001709 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001710 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001711 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001712 PyErr_NoMemory();
1713 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714 }
Victor Stinner506f5922011-09-28 22:34:18 +02001715 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1716 _PyUnicode_WSTR(unicode), end,
1717 PyUnicode_2BYTE_DATA(unicode));
1718 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1719 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1720 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001721 _PyUnicode_UTF8(unicode) = NULL;
1722 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001723 PyObject_FREE(_PyUnicode_WSTR(unicode));
1724 _PyUnicode_WSTR(unicode) = NULL;
1725 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1726#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001727 }
1728 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1729 else {
1730#if SIZEOF_WCHAR_T == 2
1731 /* in case the native representation is 2-bytes, we need to allocate a
1732 new normalized 4-byte version. */
1733 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001734 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1735 PyErr_NoMemory();
1736 return -1;
1737 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001738 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1739 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740 PyErr_NoMemory();
1741 return -1;
1742 }
1743 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1744 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001745 _PyUnicode_UTF8(unicode) = NULL;
1746 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001747 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1748 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001749 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 PyObject_FREE(_PyUnicode_WSTR(unicode));
1751 _PyUnicode_WSTR(unicode) = NULL;
1752 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1753#else
1754 assert(num_surrogates == 0);
1755
Victor Stinnerc3c74152011-10-02 20:39:55 +02001756 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001758 _PyUnicode_UTF8(unicode) = NULL;
1759 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1761#endif
1762 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1763 }
1764 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001765 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 return 0;
1767}
1768
Alexander Belopolsky40018472011-02-26 01:02:56 +00001769static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001770unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771{
Walter Dörwald16807132007-05-25 13:52:07 +00001772 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001773 case SSTATE_NOT_INTERNED:
1774 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001775
Benjamin Peterson29060642009-01-31 22:14:21 +00001776 case SSTATE_INTERNED_MORTAL:
1777 /* revive dead object temporarily for DelItem */
1778 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001779 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001780 Py_FatalError(
1781 "deletion of interned string failed");
1782 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001783
Benjamin Peterson29060642009-01-31 22:14:21 +00001784 case SSTATE_INTERNED_IMMORTAL:
1785 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001786 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001787
Benjamin Peterson29060642009-01-31 22:14:21 +00001788 default:
1789 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001790 }
1791
Victor Stinner03490912011-10-03 23:45:12 +02001792 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001794 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001795 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001796 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1797 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001799 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800}
1801
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons kept
   by this file (the empty string or a cached one-character Latin-1 string),
   0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of exactly one character can be a cached
       Latin-1 singleton. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1818
Alexander Belopolsky40018472011-02-26 01:02:56 +00001819static int
Victor Stinner488fa492011-12-12 00:01:39 +01001820unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001821{
Victor Stinner488fa492011-12-12 00:01:39 +01001822 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001823 if (Py_REFCNT(unicode) != 1)
1824 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001825 if (_PyUnicode_HASH(unicode) != -1)
1826 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001827 if (PyUnicode_CHECK_INTERNED(unicode))
1828 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001829 if (!PyUnicode_CheckExact(unicode))
1830 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001831#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001832 /* singleton refcount is greater than 1 */
1833 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001834#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001835 return 1;
1836}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001837
Victor Stinnerfe226c02011-10-03 03:52:20 +02001838static int
1839unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1840{
1841 PyObject *unicode;
1842 Py_ssize_t old_length;
1843
1844 assert(p_unicode != NULL);
1845 unicode = *p_unicode;
1846
1847 assert(unicode != NULL);
1848 assert(PyUnicode_Check(unicode));
1849 assert(0 <= length);
1850
Victor Stinner910337b2011-10-03 03:20:16 +02001851 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001852 old_length = PyUnicode_WSTR_LENGTH(unicode);
1853 else
1854 old_length = PyUnicode_GET_LENGTH(unicode);
1855 if (old_length == length)
1856 return 0;
1857
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001858 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001859 _Py_INCREF_UNICODE_EMPTY();
1860 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001861 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001862 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001863 return 0;
1864 }
1865
Victor Stinner488fa492011-12-12 00:01:39 +01001866 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001867 PyObject *copy = resize_copy(unicode, length);
1868 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001869 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001870 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001871 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001872 }
1873
Victor Stinnerfe226c02011-10-03 03:52:20 +02001874 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001875 PyObject *new_unicode = resize_compact(unicode, length);
1876 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001878 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001879 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001880 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001881 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001882}
1883
Alexander Belopolsky40018472011-02-26 01:02:56 +00001884int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001885PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001886{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 PyObject *unicode;
1888 if (p_unicode == NULL) {
1889 PyErr_BadInternalCall();
1890 return -1;
1891 }
1892 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001893 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001894 {
1895 PyErr_BadInternalCall();
1896 return -1;
1897 }
1898 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001899}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001900
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001901/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001902
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001903 WARNING: The function doesn't copy the terminating null character and
1904 doesn't check the maximum character (may write a latin1 character in an
1905 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001906static void
1907unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1908 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001909{
1910 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1911 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001912 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001913
1914 switch (kind) {
1915 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001916 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001917#ifdef Py_DEBUG
1918 if (PyUnicode_IS_ASCII(unicode)) {
1919 Py_UCS4 maxchar = ucs1lib_find_max_char(
1920 (const Py_UCS1*)str,
1921 (const Py_UCS1*)str + len);
1922 assert(maxchar < 128);
1923 }
1924#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001925 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001926 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001927 }
1928 case PyUnicode_2BYTE_KIND: {
1929 Py_UCS2 *start = (Py_UCS2 *)data + index;
1930 Py_UCS2 *ucs2 = start;
1931 assert(index <= PyUnicode_GET_LENGTH(unicode));
1932
Victor Stinner184252a2012-06-16 02:57:41 +02001933 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001934 *ucs2 = (Py_UCS2)*str;
1935
1936 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001937 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001938 }
1939 default: {
1940 Py_UCS4 *start = (Py_UCS4 *)data + index;
1941 Py_UCS4 *ucs4 = start;
1942 assert(kind == PyUnicode_4BYTE_KIND);
1943 assert(index <= PyUnicode_GET_LENGTH(unicode));
1944
Victor Stinner184252a2012-06-16 02:57:41 +02001945 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001946 *ucs4 = (Py_UCS4)*str;
1947
1948 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001949 }
1950 }
1951}
1952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953static PyObject*
1954get_latin1_char(unsigned char ch)
1955{
Victor Stinnera464fc12011-10-02 20:39:30 +02001956 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001958 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959 if (!unicode)
1960 return NULL;
1961 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001962 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963 unicode_latin1[ch] = unicode;
1964 }
1965 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001966 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967}
1968
Victor Stinner985a82a2014-01-03 12:53:47 +01001969static PyObject*
1970unicode_char(Py_UCS4 ch)
1971{
1972 PyObject *unicode;
1973
1974 assert(ch <= MAX_UNICODE);
1975
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001976 if (ch < 256)
1977 return get_latin1_char(ch);
1978
Victor Stinner985a82a2014-01-03 12:53:47 +01001979 unicode = PyUnicode_New(1, ch);
1980 if (unicode == NULL)
1981 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001982
1983 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1984 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001985 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001986 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001987 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1988 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1989 }
1990 assert(_PyUnicode_CheckConsistency(unicode, 1));
1991 return unicode;
1992}
1993
Alexander Belopolsky40018472011-02-26 01:02:56 +00001994PyObject *
1995PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001997 if (u == NULL)
1998 return (PyObject*)_PyUnicode_New(size);
1999
2000 if (size < 0) {
2001 PyErr_BadInternalCall();
2002 return NULL;
2003 }
2004
2005 return PyUnicode_FromWideChar(u, size);
2006}
2007
2008PyObject *
2009PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2010{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002011 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 Py_UCS4 maxchar = 0;
2013 Py_ssize_t num_surrogates;
2014
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002015 if (u == NULL && size != 0) {
2016 PyErr_BadInternalCall();
2017 return NULL;
2018 }
2019
2020 if (size == -1) {
2021 size = wcslen(u);
2022 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002023
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002024 /* If the Unicode data is known at construction time, we can apply
2025 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002028 if (size == 0)
2029 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 /* Single character Unicode objects in the Latin-1 range are
2032 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002033 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 return get_latin1_char((unsigned char)*u);
2035
2036 /* If not empty and not single character, copy the Unicode data
2037 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002038 if (find_maxchar_surrogates(u, u + size,
2039 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 return NULL;
2041
Victor Stinner8faf8212011-12-08 22:14:11 +01002042 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 if (!unicode)
2044 return NULL;
2045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 switch (PyUnicode_KIND(unicode)) {
2047 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002048 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002049 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2050 break;
2051 case PyUnicode_2BYTE_KIND:
2052#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002053 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002054#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002055 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2057#endif
2058 break;
2059 case PyUnicode_4BYTE_KIND:
2060#if SIZEOF_WCHAR_T == 2
2061 /* This is the only case which has to process surrogates, thus
2062 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002063 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064#else
2065 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002066 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067#endif
2068 break;
2069 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002070 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002073 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074}
2075
Alexander Belopolsky40018472011-02-26 01:02:56 +00002076PyObject *
2077PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002078{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002079 if (size < 0) {
2080 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002081 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002082 return NULL;
2083 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002084 if (u != NULL)
2085 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2086 else
2087 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002088}
2089
Alexander Belopolsky40018472011-02-26 01:02:56 +00002090PyObject *
2091PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002092{
2093 size_t size = strlen(u);
2094 if (size > PY_SSIZE_T_MAX) {
2095 PyErr_SetString(PyExc_OverflowError, "input too long");
2096 return NULL;
2097 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002098 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002099}
2100
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002101PyObject *
2102_PyUnicode_FromId(_Py_Identifier *id)
2103{
2104 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002105 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2106 strlen(id->string),
2107 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002108 if (!id->object)
2109 return NULL;
2110 PyUnicode_InternInPlace(&id->object);
2111 assert(!id->next);
2112 id->next = static_strings;
2113 static_strings = id;
2114 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002115 return id->object;
2116}
2117
2118void
2119_PyUnicode_ClearStaticStrings()
2120{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002121 _Py_Identifier *tmp, *s = static_strings;
2122 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002123 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002124 tmp = s->next;
2125 s->next = NULL;
2126 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002127 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002128 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002129}
2130
Benjamin Peterson0df54292012-03-26 14:50:32 -04002131/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002132
Victor Stinnerd3f08822012-05-29 12:57:52 +02002133PyObject*
2134_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002135{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002136 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002137 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002138 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002139#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002140 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002141#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002142 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002143 }
Victor Stinner785938e2011-12-11 20:09:03 +01002144 unicode = PyUnicode_New(size, 127);
2145 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002146 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002147 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2148 assert(_PyUnicode_CheckConsistency(unicode, 1));
2149 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002150}
2151
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002152static Py_UCS4
2153kind_maxchar_limit(unsigned int kind)
2154{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002155 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002156 case PyUnicode_1BYTE_KIND:
2157 return 0x80;
2158 case PyUnicode_2BYTE_KIND:
2159 return 0x100;
2160 case PyUnicode_4BYTE_KIND:
2161 return 0x10000;
2162 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002163 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002164 }
2165}
2166
Victor Stinner702c7342011-10-05 13:50:52 +02002167static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002168_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002169{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002171 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002172
Serhiy Storchaka678db842013-01-26 12:16:36 +02002173 if (size == 0)
2174 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002175 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002176 if (size == 1)
2177 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002178
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002179 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002180 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 if (!res)
2182 return NULL;
2183 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002184 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002186}
2187
Victor Stinnere57b1c02011-09-28 22:20:48 +02002188static PyObject*
2189_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190{
2191 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002192 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002193
Serhiy Storchaka678db842013-01-26 12:16:36 +02002194 if (size == 0)
2195 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002196 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002197 if (size == 1)
2198 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002199
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002200 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002201 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002202 if (!res)
2203 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002204 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002206 else {
2207 _PyUnicode_CONVERT_BYTES(
2208 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2209 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002210 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 return res;
2212}
2213
Victor Stinnere57b1c02011-09-28 22:20:48 +02002214static PyObject*
2215_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216{
2217 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002218 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002219
Serhiy Storchaka678db842013-01-26 12:16:36 +02002220 if (size == 0)
2221 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002222 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002223 if (size == 1)
2224 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002225
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002226 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002227 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 if (!res)
2229 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002230 if (max_char < 256)
2231 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2232 PyUnicode_1BYTE_DATA(res));
2233 else if (max_char < 0x10000)
2234 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2235 PyUnicode_2BYTE_DATA(res));
2236 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002238 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 return res;
2240}
2241
2242PyObject*
2243PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2244{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002245 if (size < 0) {
2246 PyErr_SetString(PyExc_ValueError, "size must be positive");
2247 return NULL;
2248 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002249 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002250 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002251 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002253 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002255 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002256 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002257 PyErr_SetString(PyExc_SystemError, "invalid kind");
2258 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260}
2261
Victor Stinnerece58de2012-04-23 23:36:38 +02002262Py_UCS4
2263_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2264{
2265 enum PyUnicode_Kind kind;
2266 void *startptr, *endptr;
2267
2268 assert(PyUnicode_IS_READY(unicode));
2269 assert(0 <= start);
2270 assert(end <= PyUnicode_GET_LENGTH(unicode));
2271 assert(start <= end);
2272
2273 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2274 return PyUnicode_MAX_CHAR_VALUE(unicode);
2275
2276 if (start == end)
2277 return 127;
2278
Victor Stinner94d558b2012-04-27 22:26:58 +02002279 if (PyUnicode_IS_ASCII(unicode))
2280 return 127;
2281
Victor Stinnerece58de2012-04-23 23:36:38 +02002282 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002283 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002284 endptr = (char *)startptr + end * kind;
2285 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002286 switch(kind) {
2287 case PyUnicode_1BYTE_KIND:
2288 return ucs1lib_find_max_char(startptr, endptr);
2289 case PyUnicode_2BYTE_KIND:
2290 return ucs2lib_find_max_char(startptr, endptr);
2291 case PyUnicode_4BYTE_KIND:
2292 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002293 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002294 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002295 }
2296}
2297
Victor Stinner25a4b292011-10-06 12:31:55 +02002298/* Ensure that a string uses the most efficient storage, if it is not the
2299 case: create a new string with of the right kind. Write NULL into *p_unicode
2300 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002301static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002302unicode_adjust_maxchar(PyObject **p_unicode)
2303{
2304 PyObject *unicode, *copy;
2305 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002306 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002307 unsigned int kind;
2308
2309 assert(p_unicode != NULL);
2310 unicode = *p_unicode;
2311 assert(PyUnicode_IS_READY(unicode));
2312 if (PyUnicode_IS_ASCII(unicode))
2313 return;
2314
2315 len = PyUnicode_GET_LENGTH(unicode);
2316 kind = PyUnicode_KIND(unicode);
2317 if (kind == PyUnicode_1BYTE_KIND) {
2318 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002319 max_char = ucs1lib_find_max_char(u, u + len);
2320 if (max_char >= 128)
2321 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002322 }
2323 else if (kind == PyUnicode_2BYTE_KIND) {
2324 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002325 max_char = ucs2lib_find_max_char(u, u + len);
2326 if (max_char >= 256)
2327 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002328 }
2329 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002330 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002331 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002332 max_char = ucs4lib_find_max_char(u, u + len);
2333 if (max_char >= 0x10000)
2334 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002335 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002336 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002337 if (copy != NULL)
2338 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002339 Py_DECREF(unicode);
2340 *p_unicode = copy;
2341}
2342
Victor Stinner034f6cf2011-09-30 02:26:44 +02002343PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002344_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002345{
Victor Stinner87af4f22011-11-21 23:03:47 +01002346 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002347 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002348
Victor Stinner034f6cf2011-09-30 02:26:44 +02002349 if (!PyUnicode_Check(unicode)) {
2350 PyErr_BadInternalCall();
2351 return NULL;
2352 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002353 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002354 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002355
Victor Stinner87af4f22011-11-21 23:03:47 +01002356 length = PyUnicode_GET_LENGTH(unicode);
2357 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002358 if (!copy)
2359 return NULL;
2360 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2361
Christian Heimesf051e432016-09-13 20:22:02 +02002362 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002363 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002364 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002365 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002366}
2367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002368
Victor Stinnerbc603d12011-10-02 01:00:40 +02002369/* Widen Unicode objects to larger buffers. Don't write terminating null
2370 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371
2372void*
2373_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2374{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002375 Py_ssize_t len;
2376 void *result;
2377 unsigned int skind;
2378
Benjamin Petersonbac79492012-01-14 13:34:47 -05002379 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002380 return NULL;
2381
2382 len = PyUnicode_GET_LENGTH(s);
2383 skind = PyUnicode_KIND(s);
2384 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002385 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 return NULL;
2387 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002388 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002389 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002390 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002391 if (!result)
2392 return PyErr_NoMemory();
2393 assert(skind == PyUnicode_1BYTE_KIND);
2394 _PyUnicode_CONVERT_BYTES(
2395 Py_UCS1, Py_UCS2,
2396 PyUnicode_1BYTE_DATA(s),
2397 PyUnicode_1BYTE_DATA(s) + len,
2398 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002400 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002401 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002402 if (!result)
2403 return PyErr_NoMemory();
2404 if (skind == PyUnicode_2BYTE_KIND) {
2405 _PyUnicode_CONVERT_BYTES(
2406 Py_UCS2, Py_UCS4,
2407 PyUnicode_2BYTE_DATA(s),
2408 PyUnicode_2BYTE_DATA(s) + len,
2409 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002411 else {
2412 assert(skind == PyUnicode_1BYTE_KIND);
2413 _PyUnicode_CONVERT_BYTES(
2414 Py_UCS1, Py_UCS4,
2415 PyUnicode_1BYTE_DATA(s),
2416 PyUnicode_1BYTE_DATA(s) + len,
2417 result);
2418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002420 default:
2421 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 }
Victor Stinner01698042011-10-04 00:04:26 +02002423 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 return NULL;
2425}
2426
2427static Py_UCS4*
2428as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2429 int copy_null)
2430{
2431 int kind;
2432 void *data;
2433 Py_ssize_t len, targetlen;
2434 if (PyUnicode_READY(string) == -1)
2435 return NULL;
2436 kind = PyUnicode_KIND(string);
2437 data = PyUnicode_DATA(string);
2438 len = PyUnicode_GET_LENGTH(string);
2439 targetlen = len;
2440 if (copy_null)
2441 targetlen++;
2442 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002443 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 if (!target) {
2445 PyErr_NoMemory();
2446 return NULL;
2447 }
2448 }
2449 else {
2450 if (targetsize < targetlen) {
2451 PyErr_Format(PyExc_SystemError,
2452 "string is longer than the buffer");
2453 if (copy_null && 0 < targetsize)
2454 target[0] = 0;
2455 return NULL;
2456 }
2457 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002458 if (kind == PyUnicode_1BYTE_KIND) {
2459 Py_UCS1 *start = (Py_UCS1 *) data;
2460 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002462 else if (kind == PyUnicode_2BYTE_KIND) {
2463 Py_UCS2 *start = (Py_UCS2 *) data;
2464 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2465 }
2466 else {
2467 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002468 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002470 if (copy_null)
2471 target[len] = 0;
2472 return target;
2473}
2474
2475Py_UCS4*
2476PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2477 int copy_null)
2478{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002479 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 PyErr_BadInternalCall();
2481 return NULL;
2482 }
2483 return as_ucs4(string, target, targetsize, copy_null);
2484}
2485
2486Py_UCS4*
2487PyUnicode_AsUCS4Copy(PyObject *string)
2488{
2489 return as_ucs4(string, NULL, 0, 1);
2490}
2491
Victor Stinner15a11362012-10-06 23:48:20 +02002492/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002493 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2494 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2495#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002496
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002497static int
2498unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2499 Py_ssize_t width, Py_ssize_t precision)
2500{
2501 Py_ssize_t length, fill, arglen;
2502 Py_UCS4 maxchar;
2503
2504 if (PyUnicode_READY(str) == -1)
2505 return -1;
2506
2507 length = PyUnicode_GET_LENGTH(str);
2508 if ((precision == -1 || precision >= length)
2509 && width <= length)
2510 return _PyUnicodeWriter_WriteStr(writer, str);
2511
2512 if (precision != -1)
2513 length = Py_MIN(precision, length);
2514
2515 arglen = Py_MAX(length, width);
2516 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2517 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2518 else
2519 maxchar = writer->maxchar;
2520
2521 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2522 return -1;
2523
2524 if (width > length) {
2525 fill = width - length;
2526 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2527 return -1;
2528 writer->pos += fill;
2529 }
2530
2531 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2532 str, 0, length);
2533 writer->pos += length;
2534 return 0;
2535}
2536
2537static int
Victor Stinner998b8062018-09-12 00:23:25 +02002538unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002539 Py_ssize_t width, Py_ssize_t precision)
2540{
2541 /* UTF-8 */
2542 Py_ssize_t length;
2543 PyObject *unicode;
2544 int res;
2545
2546 length = strlen(str);
2547 if (precision != -1)
2548 length = Py_MIN(length, precision);
2549 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2550 if (unicode == NULL)
2551 return -1;
2552
2553 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2554 Py_DECREF(unicode);
2555 return res;
2556}
2557
Victor Stinner96865452011-03-01 23:44:09 +00002558static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002559unicode_fromformat_arg(_PyUnicodeWriter *writer,
2560 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002561{
Victor Stinnere215d962012-10-06 23:03:36 +02002562 const char *p;
2563 Py_ssize_t len;
2564 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002565 Py_ssize_t width;
2566 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002567 int longflag;
2568 int longlongflag;
2569 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002570 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002571
2572 p = f;
2573 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002574 zeropad = 0;
2575 if (*f == '0') {
2576 zeropad = 1;
2577 f++;
2578 }
Victor Stinner96865452011-03-01 23:44:09 +00002579
2580 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002581 width = -1;
2582 if (Py_ISDIGIT((unsigned)*f)) {
2583 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002584 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002585 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002586 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002587 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002588 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002589 return NULL;
2590 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002591 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002592 f++;
2593 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002594 }
2595 precision = -1;
2596 if (*f == '.') {
2597 f++;
2598 if (Py_ISDIGIT((unsigned)*f)) {
2599 precision = (*f - '0');
2600 f++;
2601 while (Py_ISDIGIT((unsigned)*f)) {
2602 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2603 PyErr_SetString(PyExc_ValueError,
2604 "precision too big");
2605 return NULL;
2606 }
2607 precision = (precision * 10) + (*f - '0');
2608 f++;
2609 }
2610 }
Victor Stinner96865452011-03-01 23:44:09 +00002611 if (*f == '%') {
2612 /* "%.3%s" => f points to "3" */
2613 f--;
2614 }
2615 }
2616 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002617 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002618 f--;
2619 }
Victor Stinner96865452011-03-01 23:44:09 +00002620
2621 /* Handle %ld, %lu, %lld and %llu. */
2622 longflag = 0;
2623 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002624 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002625 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002626 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002627 longflag = 1;
2628 ++f;
2629 }
Victor Stinner96865452011-03-01 23:44:09 +00002630 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002631 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002632 longlongflag = 1;
2633 f += 2;
2634 }
Victor Stinner96865452011-03-01 23:44:09 +00002635 }
2636 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002637 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002638 size_tflag = 1;
2639 ++f;
2640 }
Victor Stinnere215d962012-10-06 23:03:36 +02002641
2642 if (f[1] == '\0')
2643 writer->overallocate = 0;
2644
2645 switch (*f) {
2646 case 'c':
2647 {
2648 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002649 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002650 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002651 "character argument not in range(0x110000)");
2652 return NULL;
2653 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002654 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002655 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002656 break;
2657 }
2658
2659 case 'i':
2660 case 'd':
2661 case 'u':
2662 case 'x':
2663 {
2664 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002665 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002666 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002667
2668 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002669 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002670 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002671 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002672 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002673 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002674 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002675 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002676 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002677 va_arg(*vargs, size_t));
2678 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002679 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002680 va_arg(*vargs, unsigned int));
2681 }
2682 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002683 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002684 }
2685 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002686 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002687 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002688 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002689 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002690 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002691 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002692 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002694 va_arg(*vargs, Py_ssize_t));
2695 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002696 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002697 va_arg(*vargs, int));
2698 }
2699 assert(len >= 0);
2700
Victor Stinnere215d962012-10-06 23:03:36 +02002701 if (precision < len)
2702 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002703
2704 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002705 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2706 return NULL;
2707
Victor Stinnere215d962012-10-06 23:03:36 +02002708 if (width > precision) {
2709 Py_UCS4 fillchar;
2710 fill = width - precision;
2711 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002712 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2713 return NULL;
2714 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002715 }
Victor Stinner15a11362012-10-06 23:48:20 +02002716 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002717 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002718 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2719 return NULL;
2720 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002721 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002722
Victor Stinner4a587072013-11-19 12:54:53 +01002723 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2724 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002725 break;
2726 }
2727
2728 case 'p':
2729 {
2730 char number[MAX_LONG_LONG_CHARS];
2731
2732 len = sprintf(number, "%p", va_arg(*vargs, void*));
2733 assert(len >= 0);
2734
2735 /* %p is ill-defined: ensure leading 0x. */
2736 if (number[1] == 'X')
2737 number[1] = 'x';
2738 else if (number[1] != 'x') {
2739 memmove(number + 2, number,
2740 strlen(number) + 1);
2741 number[0] = '0';
2742 number[1] = 'x';
2743 len += 2;
2744 }
2745
Victor Stinner4a587072013-11-19 12:54:53 +01002746 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002747 return NULL;
2748 break;
2749 }
2750
2751 case 's':
2752 {
2753 /* UTF-8 */
2754 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002755 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002756 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002757 break;
2758 }
2759
2760 case 'U':
2761 {
2762 PyObject *obj = va_arg(*vargs, PyObject *);
2763 assert(obj && _PyUnicode_CHECK(obj));
2764
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002766 return NULL;
2767 break;
2768 }
2769
2770 case 'V':
2771 {
2772 PyObject *obj = va_arg(*vargs, PyObject *);
2773 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002774 if (obj) {
2775 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002776 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002777 return NULL;
2778 }
2779 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002781 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002782 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002783 }
2784 break;
2785 }
2786
2787 case 'S':
2788 {
2789 PyObject *obj = va_arg(*vargs, PyObject *);
2790 PyObject *str;
2791 assert(obj);
2792 str = PyObject_Str(obj);
2793 if (!str)
2794 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002795 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002796 Py_DECREF(str);
2797 return NULL;
2798 }
2799 Py_DECREF(str);
2800 break;
2801 }
2802
2803 case 'R':
2804 {
2805 PyObject *obj = va_arg(*vargs, PyObject *);
2806 PyObject *repr;
2807 assert(obj);
2808 repr = PyObject_Repr(obj);
2809 if (!repr)
2810 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002811 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002812 Py_DECREF(repr);
2813 return NULL;
2814 }
2815 Py_DECREF(repr);
2816 break;
2817 }
2818
2819 case 'A':
2820 {
2821 PyObject *obj = va_arg(*vargs, PyObject *);
2822 PyObject *ascii;
2823 assert(obj);
2824 ascii = PyObject_ASCII(obj);
2825 if (!ascii)
2826 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002827 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002828 Py_DECREF(ascii);
2829 return NULL;
2830 }
2831 Py_DECREF(ascii);
2832 break;
2833 }
2834
2835 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002836 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002837 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002838 break;
2839
2840 default:
2841 /* if we stumble upon an unknown formatting code, copy the rest
2842 of the format string to the output string. (we cannot just
2843 skip the code, since there's no way to know what's in the
2844 argument list) */
2845 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002846 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002847 return NULL;
2848 f = p+len;
2849 return f;
2850 }
2851
2852 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002853 return f;
2854}
2855
Walter Dörwaldd2034312007-05-18 16:29:38 +00002856PyObject *
2857PyUnicode_FromFormatV(const char *format, va_list vargs)
2858{
Victor Stinnere215d962012-10-06 23:03:36 +02002859 va_list vargs2;
2860 const char *f;
2861 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002862
Victor Stinner8f674cc2013-04-17 23:02:17 +02002863 _PyUnicodeWriter_Init(&writer);
2864 writer.min_length = strlen(format) + 100;
2865 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002866
Benjamin Peterson0c212142016-09-20 20:39:33 -07002867 // Copy varags to be able to pass a reference to a subfunction.
2868 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002869
2870 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002871 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002872 f = unicode_fromformat_arg(&writer, f, &vargs2);
2873 if (f == NULL)
2874 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002875 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002876 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002877 const char *p;
2878 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002879
Victor Stinnere215d962012-10-06 23:03:36 +02002880 p = f;
2881 do
2882 {
2883 if ((unsigned char)*p > 127) {
2884 PyErr_Format(PyExc_ValueError,
2885 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2886 "string, got a non-ASCII byte: 0x%02x",
2887 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002888 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002889 }
2890 p++;
2891 }
2892 while (*p != '\0' && *p != '%');
2893 len = p - f;
2894
2895 if (*p == '\0')
2896 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002897
2898 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002899 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002900
2901 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002902 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002903 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002904 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002905 return _PyUnicodeWriter_Finish(&writer);
2906
2907 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002908 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002909 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002910 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002911}
2912
Walter Dörwaldd2034312007-05-18 16:29:38 +00002913PyObject *
2914PyUnicode_FromFormat(const char *format, ...)
2915{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002916 PyObject* ret;
2917 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002918
2919#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002920 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002921#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002922 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002923#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002924 ret = PyUnicode_FromFormatV(format, vargs);
2925 va_end(vargs);
2926 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002927}
2928
Serhiy Storchakac46db922018-10-23 22:58:24 +03002929static Py_ssize_t
2930unicode_get_widechar_size(PyObject *unicode)
2931{
2932 Py_ssize_t res;
2933
2934 assert(unicode != NULL);
2935 assert(_PyUnicode_CHECK(unicode));
2936
2937 if (_PyUnicode_WSTR(unicode) != NULL) {
2938 return PyUnicode_WSTR_LENGTH(unicode);
2939 }
2940 assert(PyUnicode_IS_READY(unicode));
2941
2942 res = _PyUnicode_LENGTH(unicode);
2943#if SIZEOF_WCHAR_T == 2
2944 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2945 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2946 const Py_UCS4 *end = s + res;
2947 for (; s < end; ++s) {
2948 if (*s > 0xFFFF) {
2949 ++res;
2950 }
2951 }
2952 }
2953#endif
2954 return res;
2955}
2956
2957static void
2958unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
2959{
2960 const wchar_t *wstr;
2961
2962 assert(unicode != NULL);
2963 assert(_PyUnicode_CHECK(unicode));
2964
2965 wstr = _PyUnicode_WSTR(unicode);
2966 if (wstr != NULL) {
2967 memcpy(w, wstr, size * sizeof(wchar_t));
2968 return;
2969 }
2970 assert(PyUnicode_IS_READY(unicode));
2971
2972 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
2973 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
2974 for (; size--; ++s, ++w) {
2975 *w = *s;
2976 }
2977 }
2978 else {
2979#if SIZEOF_WCHAR_T == 4
2980 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
2981 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
2982 for (; size--; ++s, ++w) {
2983 *w = *s;
2984 }
2985#else
2986 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2987 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2988 for (; size--; ++s, ++w) {
2989 Py_UCS4 ch = *s;
2990 if (ch > 0xFFFF) {
2991 assert(ch <= MAX_UNICODE);
2992 /* encode surrogate pair in this case */
2993 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
2994 if (!size--)
2995 break;
2996 *w = Py_UNICODE_LOW_SURROGATE(ch);
2997 }
2998 else {
2999 *w = ch;
3000 }
3001 }
3002#endif
3003 }
3004}
3005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003006#ifdef HAVE_WCHAR_H
3007
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003008/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003009
Victor Stinnerd88d9832011-09-06 02:00:05 +02003010 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003011 character) required to convert the unicode object. Ignore size argument.
3012
Victor Stinnerd88d9832011-09-06 02:00:05 +02003013 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003014 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003015 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003016Py_ssize_t
3017PyUnicode_AsWideChar(PyObject *unicode,
3018 wchar_t *w,
3019 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003020{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003021 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003022
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003023 if (unicode == NULL) {
3024 PyErr_BadInternalCall();
3025 return -1;
3026 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003027 if (!PyUnicode_Check(unicode)) {
3028 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003029 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003030 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003031
3032 res = unicode_get_widechar_size(unicode);
3033 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003034 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003035 }
3036
3037 if (size > res) {
3038 size = res + 1;
3039 }
3040 else {
3041 res = size;
3042 }
3043 unicode_copy_as_widechar(unicode, w, size);
3044 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003045}
3046
Victor Stinner137c34c2010-09-29 10:25:54 +00003047wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003048PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003049 Py_ssize_t *size)
3050{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003051 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003052 Py_ssize_t buflen;
3053
3054 if (unicode == NULL) {
3055 PyErr_BadInternalCall();
3056 return NULL;
3057 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003058 if (!PyUnicode_Check(unicode)) {
3059 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003060 return NULL;
3061 }
3062
Serhiy Storchakac46db922018-10-23 22:58:24 +03003063 buflen = unicode_get_widechar_size(unicode);
3064 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003065 if (buffer == NULL) {
3066 PyErr_NoMemory();
3067 return NULL;
3068 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003069 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3070 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003071 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003072 }
3073 else if (wcslen(buffer) != (size_t)buflen) {
3074 PyMem_FREE(buffer);
3075 PyErr_SetString(PyExc_ValueError,
3076 "embedded null character");
3077 return NULL;
3078 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003079 return buffer;
3080}
3081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003082#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083
Alexander Belopolsky40018472011-02-26 01:02:56 +00003084PyObject *
3085PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003086{
Victor Stinner8faf8212011-12-08 22:14:11 +01003087 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 PyErr_SetString(PyExc_ValueError,
3089 "chr() arg not in range(0x110000)");
3090 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003091 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003092
Victor Stinner985a82a2014-01-03 12:53:47 +01003093 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003094}
3095
Alexander Belopolsky40018472011-02-26 01:02:56 +00003096PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003097PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003099 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003100 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003101 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003102 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003103 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003104 Py_INCREF(obj);
3105 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003106 }
3107 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003108 /* For a Unicode subtype that's not a Unicode object,
3109 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003110 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003111 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003112 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003113 "Can't convert '%.100s' object to str implicitly",
3114 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003115 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003116}
3117
Alexander Belopolsky40018472011-02-26 01:02:56 +00003118PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003119PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003120 const char *encoding,
3121 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003122{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003123 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003124 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003125
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003127 PyErr_BadInternalCall();
3128 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003130
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003131 /* Decoding bytes objects is the most common case and should be fast */
3132 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003133 if (PyBytes_GET_SIZE(obj) == 0)
3134 _Py_RETURN_UNICODE_EMPTY();
3135 v = PyUnicode_Decode(
3136 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3137 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003138 return v;
3139 }
3140
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003141 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003142 PyErr_SetString(PyExc_TypeError,
3143 "decoding str is not supported");
3144 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003145 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003146
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003147 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3148 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3149 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003150 "decoding to str: need a bytes-like object, %.80s found",
3151 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003152 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003153 }
Tim Petersced69f82003-09-16 20:30:58 +00003154
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003155 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003156 PyBuffer_Release(&buffer);
3157 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003159
Serhiy Storchaka05997252013-01-26 12:14:02 +02003160 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003161 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003162 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163}
3164
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase. Return 1 on success, or 0 on error (encoding is
   longer than lower_len-1).

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters is collapsed into a single '_', except that a run at the
   very start or at the very end of the name produces nothing. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last usable slot: reserved for the terminating null. */
    char *const out_limit = &lower[lower_len - 1];
    int pending_sep = 0;
    const char *in;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit '_' lazily. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3213
Alexander Belopolsky40018472011-02-26 01:02:56 +00003214PyObject *
3215PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003216 Py_ssize_t size,
3217 const char *encoding,
3218 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003219{
3220 PyObject *buffer = NULL, *unicode;
3221 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003222 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3223
3224 if (encoding == NULL) {
3225 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3226 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003227
Fred Drakee4315f52000-05-09 19:53:39 +00003228 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003229 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3230 char *lower = buflower;
3231
3232 /* Fast paths */
3233 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3234 lower += 3;
3235 if (*lower == '_') {
3236 /* Match "utf8" and "utf_8" */
3237 lower++;
3238 }
3239
3240 if (lower[0] == '8' && lower[1] == 0) {
3241 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3242 }
3243 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3244 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3245 }
3246 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3247 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3248 }
3249 }
3250 else {
3251 if (strcmp(lower, "ascii") == 0
3252 || strcmp(lower, "us_ascii") == 0) {
3253 return PyUnicode_DecodeASCII(s, size, errors);
3254 }
Steve Dowercc16be82016-09-08 10:35:16 -07003255 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003256 else if (strcmp(lower, "mbcs") == 0) {
3257 return PyUnicode_DecodeMBCS(s, size, errors);
3258 }
3259 #endif
3260 else if (strcmp(lower, "latin1") == 0
3261 || strcmp(lower, "latin_1") == 0
3262 || strcmp(lower, "iso_8859_1") == 0
3263 || strcmp(lower, "iso8859_1") == 0) {
3264 return PyUnicode_DecodeLatin1(s, size, errors);
3265 }
3266 }
Victor Stinner37296e82010-06-10 13:36:23 +00003267 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268
3269 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003270 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003271 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003272 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003273 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003274 if (buffer == NULL)
3275 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003276 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277 if (unicode == NULL)
3278 goto onError;
3279 if (!PyUnicode_Check(unicode)) {
3280 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003281 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003282 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003283 encoding,
3284 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 Py_DECREF(unicode);
3286 goto onError;
3287 }
3288 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003289 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003290
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 Py_XDECREF(buffer);
3293 return NULL;
3294}
3295
Alexander Belopolsky40018472011-02-26 01:02:56 +00003296PyObject *
3297PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003298 const char *encoding,
3299 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003300{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003301 if (!PyUnicode_Check(unicode)) {
3302 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003303 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003304 }
3305
Serhiy Storchaka00939072016-10-27 21:05:49 +03003306 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3307 "PyUnicode_AsDecodedObject() is deprecated; "
3308 "use PyCodec_Decode() to decode from str", 1) < 0)
3309 return NULL;
3310
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003311 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003312 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003313
3314 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003315 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003316}
3317
Alexander Belopolsky40018472011-02-26 01:02:56 +00003318PyObject *
3319PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003320 const char *encoding,
3321 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003322{
3323 PyObject *v;
3324
3325 if (!PyUnicode_Check(unicode)) {
3326 PyErr_BadArgument();
3327 goto onError;
3328 }
3329
Serhiy Storchaka00939072016-10-27 21:05:49 +03003330 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3331 "PyUnicode_AsDecodedUnicode() is deprecated; "
3332 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3333 return NULL;
3334
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003335 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003337
3338 /* Decode via the codec registry */
3339 v = PyCodec_Decode(unicode, encoding, errors);
3340 if (v == NULL)
3341 goto onError;
3342 if (!PyUnicode_Check(v)) {
3343 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003344 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003345 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003346 encoding,
3347 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003348 Py_DECREF(v);
3349 goto onError;
3350 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003351 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003352
Benjamin Peterson29060642009-01-31 22:14:21 +00003353 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003354 return NULL;
3355}
3356
Alexander Belopolsky40018472011-02-26 01:02:56 +00003357PyObject *
3358PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003359 Py_ssize_t size,
3360 const char *encoding,
3361 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362{
3363 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003364
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003365 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3369 Py_DECREF(unicode);
3370 return v;
3371}
3372
Alexander Belopolsky40018472011-02-26 01:02:56 +00003373PyObject *
3374PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003375 const char *encoding,
3376 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003377{
3378 PyObject *v;
3379
3380 if (!PyUnicode_Check(unicode)) {
3381 PyErr_BadArgument();
3382 goto onError;
3383 }
3384
Serhiy Storchaka00939072016-10-27 21:05:49 +03003385 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3386 "PyUnicode_AsEncodedObject() is deprecated; "
3387 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3388 "or PyCodec_Encode() for generic encoding", 1) < 0)
3389 return NULL;
3390
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003391 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003392 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003393
3394 /* Encode via the codec registry */
3395 v = PyCodec_Encode(unicode, encoding, errors);
3396 if (v == NULL)
3397 goto onError;
3398 return v;
3399
Benjamin Peterson29060642009-01-31 22:14:21 +00003400 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003401 return NULL;
3402}
3403
Victor Stinner1b579672011-12-17 05:47:23 +01003404
Victor Stinner2cba6b82018-01-10 22:46:15 +01003405static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003406unicode_encode_locale(PyObject *unicode, const char *errors,
3407 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003408{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003409 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003410
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003411 Py_ssize_t wlen;
3412 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3413 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003414 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003415 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003416
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003417 Py_ssize_t wlen2 = wcslen(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003418 if (wlen2 != wlen) {
3419 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003420 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003421 return NULL;
3422 }
3423
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003424 char *str;
3425 size_t error_pos;
3426 const char *reason;
3427 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003428 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003429 if (res != 0) {
3430 if (res == -2) {
3431 PyObject *exc;
3432 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3433 "locale", unicode,
3434 (Py_ssize_t)error_pos,
3435 (Py_ssize_t)(error_pos+1),
3436 reason);
3437 if (exc != NULL) {
3438 PyCodec_StrictErrors(exc);
3439 Py_DECREF(exc);
3440 }
3441 return NULL;
Victor Stinner2cba6b82018-01-10 22:46:15 +01003442 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003443 else if (res == -3) {
3444 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3445 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003446 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003447 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003448 PyMem_Free(wstr);
3449 return NULL;
3450 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003451 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003452 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003453
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003454 PyObject *bytes = PyBytes_FromString(str);
3455 PyMem_RawFree(str);
3456 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003457}
3458
Victor Stinnerad158722010-10-27 00:25:46 +00003459PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003460PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3461{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003462 return unicode_encode_locale(unicode, errors, 1);
3463}
3464
3465PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003466PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003467{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003468 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003469 const _PyCoreConfig *config = &interp->core_config;
3470#if defined(__APPLE__)
3471 return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
3472#else
Victor Stinner793b5312011-04-27 00:24:21 +02003473 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3474 cannot use it to encode and decode filenames before it is loaded. Load
3475 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003476 implementation of the locale codec until the codec registry is
3477 initialized and the Python codec is loaded. See initfsencoding(). */
3478 if (interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003479 return PyUnicode_AsEncodedString(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003480 config->filesystem_encoding,
3481 config->filesystem_errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003482 }
3483 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003484 return unicode_encode_locale(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003485 config->filesystem_errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003486 }
Victor Stinnerad158722010-10-27 00:25:46 +00003487#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003488}
3489
Alexander Belopolsky40018472011-02-26 01:02:56 +00003490PyObject *
3491PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003492 const char *encoding,
3493 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494{
3495 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003496 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003497
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498 if (!PyUnicode_Check(unicode)) {
3499 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501 }
Fred Drakee4315f52000-05-09 19:53:39 +00003502
Victor Stinner942889a2016-09-05 15:40:10 -07003503 if (encoding == NULL) {
3504 return _PyUnicode_AsUTF8String(unicode, errors);
3505 }
3506
Fred Drakee4315f52000-05-09 19:53:39 +00003507 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003508 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3509 char *lower = buflower;
3510
3511 /* Fast paths */
3512 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3513 lower += 3;
3514 if (*lower == '_') {
3515 /* Match "utf8" and "utf_8" */
3516 lower++;
3517 }
3518
3519 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003520 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003521 }
3522 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3523 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3524 }
3525 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3526 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3527 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003528 }
Victor Stinner942889a2016-09-05 15:40:10 -07003529 else {
3530 if (strcmp(lower, "ascii") == 0
3531 || strcmp(lower, "us_ascii") == 0) {
3532 return _PyUnicode_AsASCIIString(unicode, errors);
3533 }
Steve Dowercc16be82016-09-08 10:35:16 -07003534#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003535 else if (strcmp(lower, "mbcs") == 0) {
3536 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3537 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003538#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003539 else if (strcmp(lower, "latin1") == 0 ||
3540 strcmp(lower, "latin_1") == 0 ||
3541 strcmp(lower, "iso_8859_1") == 0 ||
3542 strcmp(lower, "iso8859_1") == 0) {
3543 return _PyUnicode_AsLatin1String(unicode, errors);
3544 }
3545 }
Victor Stinner37296e82010-06-10 13:36:23 +00003546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547
3548 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003549 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003551 return NULL;
3552
3553 /* The normal path */
3554 if (PyBytes_Check(v))
3555 return v;
3556
3557 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003558 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003559 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003560 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003561
3562 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003563 "encoder %s returned bytearray instead of bytes; "
3564 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003565 encoding);
3566 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003567 Py_DECREF(v);
3568 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003569 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003570
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003571 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3572 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003573 Py_DECREF(v);
3574 return b;
3575 }
3576
3577 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003578 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003579 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003580 encoding,
3581 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003582 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003583 return NULL;
3584}
3585
Alexander Belopolsky40018472011-02-26 01:02:56 +00003586PyObject *
3587PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003588 const char *encoding,
3589 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003590{
3591 PyObject *v;
3592
3593 if (!PyUnicode_Check(unicode)) {
3594 PyErr_BadArgument();
3595 goto onError;
3596 }
3597
Serhiy Storchaka00939072016-10-27 21:05:49 +03003598 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3599 "PyUnicode_AsEncodedUnicode() is deprecated; "
3600 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3601 return NULL;
3602
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003603 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003604 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003605
3606 /* Encode via the codec registry */
3607 v = PyCodec_Encode(unicode, encoding, errors);
3608 if (v == NULL)
3609 goto onError;
3610 if (!PyUnicode_Check(v)) {
3611 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003612 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003613 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003614 encoding,
3615 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003616 Py_DECREF(v);
3617 goto onError;
3618 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003620
Benjamin Peterson29060642009-01-31 22:14:21 +00003621 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 return NULL;
3623}
3624
Victor Stinner2cba6b82018-01-10 22:46:15 +01003625static PyObject*
3626unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3627 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003628{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003629 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003630
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003631 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3632 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003633 return NULL;
3634 }
3635
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003636 wchar_t *wstr;
3637 size_t wlen;
3638 const char *reason;
3639 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003640 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003641 if (res != 0) {
3642 if (res == -2) {
3643 PyObject *exc;
3644 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3645 "locale", str, len,
3646 (Py_ssize_t)wlen,
3647 (Py_ssize_t)(wlen + 1),
3648 reason);
3649 if (exc != NULL) {
3650 PyCodec_StrictErrors(exc);
3651 Py_DECREF(exc);
3652 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003653 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003654 else if (res == -3) {
3655 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3656 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003657 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003658 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003659 }
Victor Stinner2f197072011-12-17 07:08:30 +01003660 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003661 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003662
3663 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3664 PyMem_RawFree(wstr);
3665 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003666}
3667
3668PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003669PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3670 const char *errors)
3671{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003672 return unicode_decode_locale(str, len, errors, 1);
3673}
3674
3675PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003676PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003677{
3678 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003679 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003680}
3681
3682
3683PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003684PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003685 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003686 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3687}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003688
Christian Heimes5894ba72007-11-04 11:43:14 +00003689PyObject*
3690PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3691{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003692 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003693 const _PyCoreConfig *config = &interp->core_config;
3694#if defined(__APPLE__)
3695 return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
3696#else
Victor Stinner793b5312011-04-27 00:24:21 +02003697 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3698 cannot use it to encode and decode filenames before it is loaded. Load
3699 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003700 implementation of the locale codec until the codec registry is
3701 initialized and the Python codec is loaded. See initfsencoding(). */
3702 if (interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003703 return PyUnicode_Decode(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003704 config->filesystem_encoding,
3705 config->filesystem_errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003706 }
3707 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003708 return unicode_decode_locale(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003709 config->filesystem_errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003710 }
Victor Stinnerad158722010-10-27 00:25:46 +00003711#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003712}
3713
Martin v. Löwis011e8422009-05-05 04:43:17 +00003714
3715int
3716PyUnicode_FSConverter(PyObject* arg, void* addr)
3717{
Brett Cannonec6ce872016-09-06 15:50:29 -07003718 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003719 PyObject *output = NULL;
3720 Py_ssize_t size;
3721 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003722 if (arg == NULL) {
3723 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003724 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003725 return 1;
3726 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003727 path = PyOS_FSPath(arg);
3728 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003729 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003730 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003731 if (PyBytes_Check(path)) {
3732 output = path;
3733 }
3734 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3735 output = PyUnicode_EncodeFSDefault(path);
3736 Py_DECREF(path);
3737 if (!output) {
3738 return 0;
3739 }
3740 assert(PyBytes_Check(output));
3741 }
3742
Victor Stinner0ea2a462010-04-30 00:22:08 +00003743 size = PyBytes_GET_SIZE(output);
3744 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003745 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003746 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003747 Py_DECREF(output);
3748 return 0;
3749 }
3750 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003751 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003752}
3753
3754
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003755int
3756PyUnicode_FSDecoder(PyObject* arg, void* addr)
3757{
Brett Cannona5711202016-09-06 19:36:01 -07003758 int is_buffer = 0;
3759 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003760 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003761 if (arg == NULL) {
3762 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003763 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003764 return 1;
3765 }
Brett Cannona5711202016-09-06 19:36:01 -07003766
3767 is_buffer = PyObject_CheckBuffer(arg);
3768 if (!is_buffer) {
3769 path = PyOS_FSPath(arg);
3770 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003771 return 0;
3772 }
Brett Cannona5711202016-09-06 19:36:01 -07003773 }
3774 else {
3775 path = arg;
3776 Py_INCREF(arg);
3777 }
3778
3779 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003780 output = path;
3781 }
3782 else if (PyBytes_Check(path) || is_buffer) {
3783 PyObject *path_bytes = NULL;
3784
3785 if (!PyBytes_Check(path) &&
3786 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003787 "path should be string, bytes, or os.PathLike, not %.200s",
3788 Py_TYPE(arg)->tp_name)) {
3789 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003790 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003791 }
3792 path_bytes = PyBytes_FromObject(path);
3793 Py_DECREF(path);
3794 if (!path_bytes) {
3795 return 0;
3796 }
3797 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3798 PyBytes_GET_SIZE(path_bytes));
3799 Py_DECREF(path_bytes);
3800 if (!output) {
3801 return 0;
3802 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003803 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003804 else {
3805 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003806 "path should be string, bytes, or os.PathLike, not %.200s",
3807 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003808 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003809 return 0;
3810 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003811 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003812 Py_DECREF(output);
3813 return 0;
3814 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003816 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003817 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003818 Py_DECREF(output);
3819 return 0;
3820 }
3821 *(PyObject**)addr = output;
3822 return Py_CLEANUP_SUPPORTED;
3823}
3824
3825
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003826const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003828{
Christian Heimesf3863112007-11-22 07:46:41 +00003829 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003830
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003831 if (!PyUnicode_Check(unicode)) {
3832 PyErr_BadArgument();
3833 return NULL;
3834 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003835 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003836 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003838 if (PyUnicode_UTF8(unicode) == NULL) {
3839 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003840 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841 if (bytes == NULL)
3842 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003843 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3844 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003845 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003846 Py_DECREF(bytes);
3847 return NULL;
3848 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003849 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003850 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003851 PyBytes_AS_STRING(bytes),
3852 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853 Py_DECREF(bytes);
3854 }
3855
3856 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003857 *psize = PyUnicode_UTF8_LENGTH(unicode);
3858 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003859}
3860
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003861const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003863{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3865}
3866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867Py_UNICODE *
3868PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3869{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870 if (!PyUnicode_Check(unicode)) {
3871 PyErr_BadArgument();
3872 return NULL;
3873 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003874 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3875 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003877 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879
Serhiy Storchakac46db922018-10-23 22:58:24 +03003880 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3881 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3882 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003885 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3886 if (w == NULL) {
3887 PyErr_NoMemory();
3888 return NULL;
3889 }
3890 unicode_copy_as_widechar(unicode, w, wlen + 1);
3891 _PyUnicode_WSTR(unicode) = w;
3892 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3893 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894 }
3895 }
3896 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003897 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003898 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00003899}
3900
Alexander Belopolsky40018472011-02-26 01:02:56 +00003901Py_UNICODE *
3902PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003904 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003905}
3906
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003907const Py_UNICODE *
3908_PyUnicode_AsUnicode(PyObject *unicode)
3909{
3910 Py_ssize_t size;
3911 const Py_UNICODE *wstr;
3912
3913 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3914 if (wstr && wcslen(wstr) != (size_t)size) {
3915 PyErr_SetString(PyExc_ValueError, "embedded null character");
3916 return NULL;
3917 }
3918 return wstr;
3919}
3920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003921
Alexander Belopolsky40018472011-02-26 01:02:56 +00003922Py_ssize_t
3923PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924{
3925 if (!PyUnicode_Check(unicode)) {
3926 PyErr_BadArgument();
3927 goto onError;
3928 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003929 if (_PyUnicode_WSTR(unicode) == NULL) {
3930 if (PyUnicode_AsUnicode(unicode) == NULL)
3931 goto onError;
3932 }
3933 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934
Benjamin Peterson29060642009-01-31 22:14:21 +00003935 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 return -1;
3937}
3938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939Py_ssize_t
3940PyUnicode_GetLength(PyObject *unicode)
3941{
Victor Stinner07621332012-06-16 04:53:46 +02003942 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943 PyErr_BadArgument();
3944 return -1;
3945 }
Victor Stinner07621332012-06-16 04:53:46 +02003946 if (PyUnicode_READY(unicode) == -1)
3947 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 return PyUnicode_GET_LENGTH(unicode);
3949}
3950
3951Py_UCS4
3952PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3953{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003954 void *data;
3955 int kind;
3956
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003957 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003958 PyErr_BadArgument();
3959 return (Py_UCS4)-1;
3960 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003961 if (PyUnicode_READY(unicode) == -1) {
3962 return (Py_UCS4)-1;
3963 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003964 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003965 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003966 return (Py_UCS4)-1;
3967 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003968 data = PyUnicode_DATA(unicode);
3969 kind = PyUnicode_KIND(unicode);
3970 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971}
3972
3973int
3974PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3975{
3976 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003977 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978 return -1;
3979 }
Victor Stinner488fa492011-12-12 00:01:39 +01003980 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003981 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003982 PyErr_SetString(PyExc_IndexError, "string index out of range");
3983 return -1;
3984 }
Victor Stinner488fa492011-12-12 00:01:39 +01003985 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003986 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003987 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3988 PyErr_SetString(PyExc_ValueError, "character out of range");
3989 return -1;
3990 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003991 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3992 index, ch);
3993 return 0;
3994}
3995
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4001
Victor Stinner554f3f02010-06-16 23:33:54 +00004002/* create or adjust a UnicodeDecodeError */
4003static void
4004make_decode_exception(PyObject **exceptionObject,
4005 const char *encoding,
4006 const char *input, Py_ssize_t length,
4007 Py_ssize_t startpos, Py_ssize_t endpos,
4008 const char *reason)
4009{
4010 if (*exceptionObject == NULL) {
4011 *exceptionObject = PyUnicodeDecodeError_Create(
4012 encoding, input, length, startpos, endpos, reason);
4013 }
4014 else {
4015 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4016 goto onError;
4017 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4018 goto onError;
4019 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4020 goto onError;
4021 }
4022 return;
4023
4024onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004025 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004026}
4027
#ifdef MS_WINDOWS
/* Decoding error callback helper for a wchar_t based output object.

   Looks up the error handler for `errors` if *errorHandler is not set yet,
   creates or updates the UnicodeDecodeError in *exceptionObject, calls the
   handler and checks that it returned a (str, int) tuple.  On success the
   replacement string is copied into *output at *outpos (growing *output if
   required) and the state variables are adjusted:
     - *input / *inend are reloaded from the exception's object, which the
       callback may have replaced;
     - *outpos advances by the replacement length;
     - *endinpos and *inptr are set to the position returned by the handler
       (a negative position counts from the end of the input).

   Returns 0 on success, -1 with an exception set on error. */

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* PyArg_ParseTuple() format; the text following "Un;" doubles as the
       TypeError message used for a non-tuple result. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    int status = -1;
    PyObject *handler_result = NULL;
    PyObject *replacement = NULL;
    PyObject *source_bytes = NULL;
    Py_ssize_t capacity;
    Py_ssize_t source_len;
    Py_ssize_t needed;
    Py_ssize_t resume_at;
    wchar_t *rep_chars;
    Py_ssize_t rep_len;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    capacity = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL) {
            goto done;
        }
    }

    make_decode_exception(exceptionObject,
                          encoding,
                          *input, *inend - *input,
                          *startinpos, *endinpos,
                          reason);
    if (*exceptionObject == NULL) {
        goto done;
    }

    handler_result = PyObject_CallFunctionObjArgs(*errorHandler,
                                                  *exceptionObject, NULL);
    if (handler_result == NULL) {
        goto done;
    }
    if (!PyTuple_Check(handler_result)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto done;
    }
    if (!PyArg_ParseTuple(handler_result, argparse,
                          &replacement, &resume_at)) {
        goto done;
    }

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    source_bytes = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (source_bytes == NULL) {
        goto done;
    }
    *input = PyBytes_AS_STRING(source_bytes);
    source_len = PyBytes_GET_SIZE(source_bytes);
    *inend = *input + source_len;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(source_bytes);

    if (resume_at < 0) {
        resume_at += source_len;
    }
    if (resume_at < 0 || resume_at > source_len) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", resume_at);
        goto done;
    }

    rep_chars = PyUnicode_AsUnicodeAndSize(replacement, &rep_len);
    if (rep_chars == NULL) {
        goto done;
    }

    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string).
       The second sum is only evaluated once the first one is known not to
       overflow. */
    if (*outpos > PY_SSIZE_T_MAX - rep_len
        || *outpos + rep_len > PY_SSIZE_T_MAX - (source_len - resume_at)) {
        PyErr_SetString(PyExc_OverflowError,
                        "decoded result is too long for a Python string");
        goto done;
    }
    needed = *outpos + rep_len + (source_len - resume_at);
    if (needed > capacity) {
        /* Grow to at least twice the current size when that is enough. */
        if (capacity <= PY_SSIZE_T_MAX/2 && needed < 2*capacity) {
            needed = 2*capacity;
        }
        if (unicode_resize(output, needed) < 0) {
            goto done;
        }
    }

    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, rep_chars, rep_len);
    *outpos += rep_len;
    *endinpos = resume_at;
    *inptr = *input + resume_at;

    /* we made it! */
    status = 0;

  done:
    Py_XDECREF(handler_result);
    return status;
}
#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004140
4141static int
4142unicode_decode_call_errorhandler_writer(
4143 const char *errors, PyObject **errorHandler,
4144 const char *encoding, const char *reason,
4145 const char **input, const char **inend, Py_ssize_t *startinpos,
4146 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4147 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4148{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004149 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004150
4151 PyObject *restuple = NULL;
4152 PyObject *repunicode = NULL;
4153 Py_ssize_t insize;
4154 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004155 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004156 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004157 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004158 int need_to_grow = 0;
4159 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004160
4161 if (*errorHandler == NULL) {
4162 *errorHandler = PyCodec_LookupError(errors);
4163 if (*errorHandler == NULL)
4164 goto onError;
4165 }
4166
4167 make_decode_exception(exceptionObject,
4168 encoding,
4169 *input, *inend - *input,
4170 *startinpos, *endinpos,
4171 reason);
4172 if (*exceptionObject == NULL)
4173 goto onError;
4174
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004175 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004176 if (restuple == NULL)
4177 goto onError;
4178 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004179 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004180 goto onError;
4181 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004182 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004183 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004184
4185 /* Copy back the bytes variables, which might have been modified by the
4186 callback */
4187 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4188 if (!inputobj)
4189 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004190 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004191 *input = PyBytes_AS_STRING(inputobj);
4192 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004193 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004194 /* we can DECREF safely, as the exception has another reference,
4195 so the object won't go away. */
4196 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004197
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004199 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004200 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004201 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004203 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204
Victor Stinner170ca6f2013-04-18 00:25:28 +02004205 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004206 if (replen > 1) {
4207 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004208 need_to_grow = 1;
4209 }
4210 new_inptr = *input + newpos;
4211 if (*inend - new_inptr > remain) {
4212 /* We don't know the decoding algorithm here so we make the worst
4213 assumption that one byte decodes to one unicode character.
4214 If unfortunately one byte could decode to more unicode characters,
4215 the decoder may write out-of-bound then. Is it possible for the
4216 algorithms using this function? */
4217 writer->min_length += *inend - new_inptr - remain;
4218 need_to_grow = 1;
4219 }
4220 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004221 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004222 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004223 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4224 goto onError;
4225 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004226 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004227 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004228
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004229 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004230 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004231
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004233 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004234 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235
Benjamin Peterson29060642009-01-31 22:14:21 +00004236 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004237 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004238 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239}
4240
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
   Note: IS_BASE64 and FROM_BASE64 evaluate their argument several times,
   so the argument must be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4267
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  The argument is evaluated twice. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is never modified, so it
 * is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4309
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times and must be side-effect free; NUL and
 * anything >= 128 are never direct.  directO / directWS are parenthesized
 * so that compound expressions (e.g. `a || b`) keep their meaning next to
 * the `&&` in the expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004321
Alexander Belopolsky40018472011-02-26 01:02:56 +00004322PyObject *
4323PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004324 Py_ssize_t size,
4325 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004326{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004327 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4328}
4329
Antoine Pitrou244651a2009-05-04 18:56:13 +00004330/* The decoder. The only state we preserve is our read position,
4331 * i.e. how many characters we have consumed. So if we end in the
4332 * middle of a shift sequence we have to back off the read position
4333 * and the output to the beginning of the sequence, otherwise we lose
4334 * all the shift state (seen bits, number of bits seen, high
4335 * surrogate). */
4336
Alexander Belopolsky40018472011-02-26 01:02:56 +00004337PyObject *
4338PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004339 Py_ssize_t size,
4340 const char *errors,
4341 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004342{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004343 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004344 Py_ssize_t startinpos;
4345 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004346 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004347 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004348 const char *errmsg = "";
4349 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004350 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004351 unsigned int base64bits = 0;
4352 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004353 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 PyObject *errorHandler = NULL;
4355 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004356
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004357 if (size == 0) {
4358 if (consumed)
4359 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004360 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004361 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004362
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004363 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004364 _PyUnicodeWriter_Init(&writer);
4365 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004366
4367 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004368 e = s + size;
4369
4370 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004371 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004373 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 if (inShift) { /* in a base-64 section */
4376 if (IS_BASE64(ch)) { /* consume a base-64 character */
4377 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4378 base64bits += 6;
4379 s++;
4380 if (base64bits >= 16) {
4381 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004382 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383 base64bits -= 16;
4384 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004385 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 if (surrogate) {
4387 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004388 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4389 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004390 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004391 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004392 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004393 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004394 }
4395 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004396 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004397 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004398 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004399 }
4400 }
Victor Stinner551ac952011-11-29 22:58:13 +01004401 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402 /* first surrogate */
4403 surrogate = outCh;
4404 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004405 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004406 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004407 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004408 }
4409 }
4410 }
4411 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 if (base64bits > 0) { /* left-over bits */
4414 if (base64bits >= 6) {
4415 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004416 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 errmsg = "partial character in shift sequence";
4418 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 else {
4421 /* Some bits remain; they should be zero */
4422 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004423 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 errmsg = "non-zero padding bits in shift sequence";
4425 goto utf7Error;
4426 }
4427 }
4428 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004429 if (surrogate && DECODE_DIRECT(ch)) {
4430 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4431 goto onError;
4432 }
4433 surrogate = 0;
4434 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004435 /* '-' is absorbed; other terminating
4436 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004437 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439 }
4440 }
4441 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004443 s++; /* consume '+' */
4444 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004445 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004446 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004447 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004449 else if (s < e && !IS_BASE64(*s)) {
4450 s++;
4451 errmsg = "ill-formed sequence";
4452 goto utf7Error;
4453 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004454 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004455 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004456 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004457 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004459 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004460 }
4461 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004463 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004464 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004465 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 else {
4468 startinpos = s-starts;
4469 s++;
4470 errmsg = "unexpected special character";
4471 goto utf7Error;
4472 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004474utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004476 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 errors, &errorHandler,
4478 "utf7", errmsg,
4479 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004480 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004482 }
4483
Antoine Pitrou244651a2009-05-04 18:56:13 +00004484 /* end of string */
4485
4486 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4487 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004488 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004489 if (surrogate ||
4490 (base64bits >= 6) ||
4491 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004492 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004493 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004494 errors, &errorHandler,
4495 "utf7", "unterminated shift sequence",
4496 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004497 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004498 goto onError;
4499 if (s < e)
4500 goto restart;
4501 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004503
4504 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004505 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004506 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004507 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004508 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004509 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004510 writer.kind, writer.data, shiftOutStart);
4511 Py_XDECREF(errorHandler);
4512 Py_XDECREF(exc);
4513 _PyUnicodeWriter_Dealloc(&writer);
4514 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004515 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004516 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004517 }
4518 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004519 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004521 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004522
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523 Py_XDECREF(errorHandler);
4524 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004525 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 Py_XDECREF(errorHandler);
4529 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004530 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004531 return NULL;
4532}
4533
4534
Alexander Belopolsky40018472011-02-26 01:02:56 +00004535PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004536_PyUnicode_EncodeUTF7(PyObject *str,
4537 int base64SetO,
4538 int base64WhiteSpace,
4539 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004540{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004541 int kind;
4542 void *data;
4543 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004544 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004545 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004546 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004547 unsigned int base64bits = 0;
4548 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004549 char * out;
4550 char * start;
4551
Benjamin Petersonbac79492012-01-14 13:34:47 -05004552 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004553 return NULL;
4554 kind = PyUnicode_KIND(str);
4555 data = PyUnicode_DATA(str);
4556 len = PyUnicode_GET_LENGTH(str);
4557
4558 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004559 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004560
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004561 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004562 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004563 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004564 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004565 if (v == NULL)
4566 return NULL;
4567
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004568 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004569 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004570 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 if (inShift) {
4573 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4574 /* shifting out */
4575 if (base64bits) { /* output remaining bits */
4576 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4577 base64buffer = 0;
4578 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579 }
4580 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 /* Characters not in the BASE64 set implicitly unshift the sequence
4582 so no '-' is required, except if the character is itself a '-' */
4583 if (IS_BASE64(ch) || ch == '-') {
4584 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004585 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004586 *out++ = (char) ch;
4587 }
4588 else {
4589 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004590 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004591 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592 else { /* not in a shift sequence */
4593 if (ch == '+') {
4594 *out++ = '+';
4595 *out++ = '-';
4596 }
4597 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4598 *out++ = (char) ch;
4599 }
4600 else {
4601 *out++ = '+';
4602 inShift = 1;
4603 goto encode_char;
4604 }
4605 }
4606 continue;
4607encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004609 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004610
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 /* code first surrogate */
4612 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004613 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 while (base64bits >= 6) {
4615 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4616 base64bits -= 6;
4617 }
4618 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004619 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004620 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004621 base64bits += 16;
4622 base64buffer = (base64buffer << 16) | ch;
4623 while (base64bits >= 6) {
4624 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4625 base64bits -= 6;
4626 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004627 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 if (base64bits)
4629 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4630 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004631 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004632 if (_PyBytes_Resize(&v, out - start) < 0)
4633 return NULL;
4634 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004635}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004636PyObject *
4637PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4638 Py_ssize_t size,
4639 int base64SetO,
4640 int base64WhiteSpace,
4641 const char *errors)
4642{
4643 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004644 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004645 if (tmp == NULL)
4646 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004647 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004648 base64WhiteSpace, errors);
4649 Py_DECREF(tmp);
4650 return result;
4651}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652
Antoine Pitrou244651a2009-05-04 18:56:13 +00004653#undef IS_BASE64
4654#undef FROM_BASE64
4655#undef TO_BASE64
4656#undef DECODE_DIRECT
4657#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004658
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659/* --- UTF-8 Codec -------------------------------------------------------- */
4660
Alexander Belopolsky40018472011-02-26 01:02:56 +00004661PyObject *
4662PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004663 Py_ssize_t size,
4664 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665{
Walter Dörwald69652032004-09-07 20:24:22 +00004666 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4667}
4668
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004669#include "stringlib/asciilib.h"
4670#include "stringlib/codecs.h"
4671#include "stringlib/undef.h"
4672
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004673#include "stringlib/ucs1lib.h"
4674#include "stringlib/codecs.h"
4675#include "stringlib/undef.h"
4676
4677#include "stringlib/ucs2lib.h"
4678#include "stringlib/codecs.h"
4679#include "stringlib/undef.h"
4680
4681#include "stringlib/ucs4lib.h"
4682#include "stringlib/codecs.h"
4683#include "stringlib/undef.h"
4684
Antoine Pitrouab868312009-01-10 15:40:25 +00004685/* Mask to quickly check whether a C 'long' contains a
4686 non-ASCII, UTF8-encoded char. */
4687#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004688# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004689#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004690# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004691#else
4692# error C 'long' size should be either 4 or 8!
4693#endif
4694
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004695static Py_ssize_t
4696ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004697{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004698 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004699 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004700
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004701 /*
4702 * Issue #17237: m68k is a bit different from most architectures in
4703 * that objects do not use "natural alignment" - for example, int and
4704 * long are only aligned at 2-byte boundaries. Therefore the assert()
4705 * won't work; also, tests have shown that skipping the "optimised
4706 * version" will even speed up m68k.
4707 */
4708#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004709#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004710 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4711 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004712 /* Fast path, see in STRINGLIB(utf8_decode) for
4713 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004714 /* Help allocation */
4715 const char *_p = p;
4716 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004717 while (_p < aligned_end) {
4718 unsigned long value = *(const unsigned long *) _p;
4719 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721 *((unsigned long *)q) = value;
4722 _p += SIZEOF_LONG;
4723 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004724 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004725 p = _p;
4726 while (p < end) {
4727 if ((unsigned char)*p & 0x80)
4728 break;
4729 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004731 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004733#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004734#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004735 while (p < end) {
4736 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4737 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004738 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004739 /* Help allocation */
4740 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004741 while (_p < aligned_end) {
4742 unsigned long value = *(unsigned long *) _p;
4743 if (value & ASCII_CHAR_MASK)
4744 break;
4745 _p += SIZEOF_LONG;
4746 }
4747 p = _p;
4748 if (_p == end)
4749 break;
4750 }
4751 if ((unsigned char)*p & 0x80)
4752 break;
4753 ++p;
4754 }
4755 memcpy(dest, start, p - start);
4756 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757}
Antoine Pitrouab868312009-01-10 15:40:25 +00004758
Victor Stinner785938e2011-12-11 20:09:03 +01004759PyObject *
4760PyUnicode_DecodeUTF8Stateful(const char *s,
4761 Py_ssize_t size,
4762 const char *errors,
4763 Py_ssize_t *consumed)
4764{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004765 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004766 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004767 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004768
4769 Py_ssize_t startinpos;
4770 Py_ssize_t endinpos;
4771 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004772 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004773 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004774 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004775
4776 if (size == 0) {
4777 if (consumed)
4778 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004779 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004780 }
4781
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004782 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4783 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004784 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004785 *consumed = 1;
4786 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004787 }
4788
Victor Stinner8f674cc2013-04-17 23:02:17 +02004789 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004790 writer.min_length = size;
4791 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004792 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004793
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004794 writer.pos = ascii_decode(s, end, writer.data);
4795 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004796 while (s < end) {
4797 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004798 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004799
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004800 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004801 if (PyUnicode_IS_ASCII(writer.buffer))
4802 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004803 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004804 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004805 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004806 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004807 } else {
4808 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004809 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004810 }
4811
4812 switch (ch) {
4813 case 0:
4814 if (s == end || consumed)
4815 goto End;
4816 errmsg = "unexpected end of data";
4817 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004818 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004819 break;
4820 case 1:
4821 errmsg = "invalid start byte";
4822 startinpos = s - starts;
4823 endinpos = startinpos + 1;
4824 break;
4825 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004826 case 3:
4827 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004828 errmsg = "invalid continuation byte";
4829 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004830 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004831 break;
4832 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004833 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004834 goto onError;
4835 continue;
4836 }
4837
Victor Stinner1d65d912015-10-05 13:43:50 +02004838 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004839 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004840
4841 switch (error_handler) {
4842 case _Py_ERROR_IGNORE:
4843 s += (endinpos - startinpos);
4844 break;
4845
4846 case _Py_ERROR_REPLACE:
4847 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4848 goto onError;
4849 s += (endinpos - startinpos);
4850 break;
4851
4852 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004853 {
4854 Py_ssize_t i;
4855
Victor Stinner1d65d912015-10-05 13:43:50 +02004856 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4857 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004858 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004859 ch = (Py_UCS4)(unsigned char)(starts[i]);
4860 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4861 ch + 0xdc00);
4862 writer.pos++;
4863 }
4864 s += (endinpos - startinpos);
4865 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004866 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004867
4868 default:
4869 if (unicode_decode_call_errorhandler_writer(
4870 errors, &error_handler_obj,
4871 "utf-8", errmsg,
4872 &starts, &end, &startinpos, &endinpos, &exc, &s,
4873 &writer))
4874 goto onError;
4875 }
Victor Stinner785938e2011-12-11 20:09:03 +01004876 }
4877
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004878End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004879 if (consumed)
4880 *consumed = s - starts;
4881
Victor Stinner1d65d912015-10-05 13:43:50 +02004882 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004883 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004884 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004885
4886onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004887 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004889 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004890 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004891}
4892
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004893
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004894/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4895 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004896
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004897 On success, write a pointer to a newly allocated wide character string into
4898 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4899 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004900
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004901 On memory allocation failure, return -1.
4902
4903 On decoding error (if surrogateescape is zero), return -2. If wlen is
4904 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4905 is not NULL, write the decoding error message into *reason. */
4906int
4907_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02004908 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004909{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004910 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004911 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004912 wchar_t *unicode;
4913 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004914
Victor Stinner3d4226a2018-08-29 22:21:32 +02004915 int surrogateescape = 0;
4916 int surrogatepass = 0;
4917 switch (errors)
4918 {
4919 case _Py_ERROR_STRICT:
4920 break;
4921 case _Py_ERROR_SURROGATEESCAPE:
4922 surrogateescape = 1;
4923 break;
4924 case _Py_ERROR_SURROGATEPASS:
4925 surrogatepass = 1;
4926 break;
4927 default:
4928 return -3;
4929 }
4930
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004931 /* Note: size will always be longer than the resulting Unicode
4932 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004933 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004934 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004935 }
4936
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004937 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004938 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004939 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004940 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004941
4942 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004943 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004944 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004945 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004946 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004947#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004948 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004949#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004950 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004951#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004952 if (ch > 0xFF) {
4953#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07004954 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02004956 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004957 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004958 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4959 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4960#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004961 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004962 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02004963 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004964 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004965 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02004966
4967 if (surrogateescape) {
4968 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4969 }
4970 else {
4971 /* Is it a valid three-byte code? */
4972 if (surrogatepass
4973 && (e - s) >= 3
4974 && (s[0] & 0xf0) == 0xe0
4975 && (s[1] & 0xc0) == 0x80
4976 && (s[2] & 0xc0) == 0x80)
4977 {
4978 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4979 s += 3;
4980 unicode[outpos++] = ch;
4981 }
4982 else {
4983 PyMem_RawFree(unicode );
4984 if (reason != NULL) {
4985 switch (ch) {
4986 case 0:
4987 *reason = "unexpected end of data";
4988 break;
4989 case 1:
4990 *reason = "invalid start byte";
4991 break;
4992 /* 2, 3, 4 */
4993 default:
4994 *reason = "invalid continuation byte";
4995 break;
4996 }
4997 }
4998 if (wlen != NULL) {
4999 *wlen = s - orig_s;
5000 }
5001 return -2;
5002 }
5003 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005004 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005005 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005006 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005007 if (wlen) {
5008 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005009 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005010 *wstr = unicode;
5011 return 0;
5012}
5013
5014wchar_t*
5015_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5016{
5017 wchar_t *wstr;
5018 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5019 if (res != 0) {
5020 return NULL;
5021 }
5022 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005023}
5024
Antoine Pitrouab868312009-01-10 15:40:25 +00005025
Victor Stinnere47e6982017-12-21 15:45:16 +01005026/* UTF-8 encoder using the surrogateescape error handler .
5027
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005028 On success, return 0 and write the newly allocated character string (use
5029 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005030
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005031 On encoding failure, return -2 and write the position of the invalid
5032 surrogate character into *error_pos (if error_pos is set) and the decoding
5033 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005034
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005035 On memory allocation failure, return -1. */
5036int
5037_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005038 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005039{
5040 const Py_ssize_t max_char_size = 4;
5041 Py_ssize_t len = wcslen(text);
5042
5043 assert(len >= 0);
5044
Victor Stinner3d4226a2018-08-29 22:21:32 +02005045 int surrogateescape = 0;
5046 int surrogatepass = 0;
5047 switch (errors)
5048 {
5049 case _Py_ERROR_STRICT:
5050 break;
5051 case _Py_ERROR_SURROGATEESCAPE:
5052 surrogateescape = 1;
5053 break;
5054 case _Py_ERROR_SURROGATEPASS:
5055 surrogatepass = 1;
5056 break;
5057 default:
5058 return -3;
5059 }
5060
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005061 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5062 return -1;
5063 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005064 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005065 if (raw_malloc) {
5066 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005067 }
5068 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005069 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005070 }
5071 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005072 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005073 }
5074
5075 char *p = bytes;
5076 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005077 for (i = 0; i < len; ) {
5078 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005079 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005080 i++;
5081#if Py_UNICODE_SIZE == 2
5082 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5083 && i < len
5084 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5085 {
5086 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5087 i++;
5088 }
5089#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005090
5091 if (ch < 0x80) {
5092 /* Encode ASCII */
5093 *p++ = (char) ch;
5094
5095 }
5096 else if (ch < 0x0800) {
5097 /* Encode Latin-1 */
5098 *p++ = (char)(0xc0 | (ch >> 6));
5099 *p++ = (char)(0x80 | (ch & 0x3f));
5100 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005101 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005102 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005103 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005104 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005105 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005106 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005107 if (reason != NULL) {
5108 *reason = "encoding error";
5109 }
5110 if (raw_malloc) {
5111 PyMem_RawFree(bytes);
5112 }
5113 else {
5114 PyMem_Free(bytes);
5115 }
5116 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005117 }
5118 *p++ = (char)(ch & 0xff);
5119 }
5120 else if (ch < 0x10000) {
5121 *p++ = (char)(0xe0 | (ch >> 12));
5122 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5123 *p++ = (char)(0x80 | (ch & 0x3f));
5124 }
5125 else { /* ch >= 0x10000 */
5126 assert(ch <= MAX_UNICODE);
5127 /* Encode UCS4 Unicode ordinals */
5128 *p++ = (char)(0xf0 | (ch >> 18));
5129 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5130 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5131 *p++ = (char)(0x80 | (ch & 0x3f));
5132 }
5133 }
5134 *p++ = '\0';
5135
5136 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005137 char *bytes2;
5138 if (raw_malloc) {
5139 bytes2 = PyMem_RawRealloc(bytes, final_size);
5140 }
5141 else {
5142 bytes2 = PyMem_Realloc(bytes, final_size);
5143 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005144 if (bytes2 == NULL) {
5145 if (error_pos != NULL) {
5146 *error_pos = (size_t)-1;
5147 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005148 if (raw_malloc) {
5149 PyMem_RawFree(bytes);
5150 }
5151 else {
5152 PyMem_Free(bytes);
5153 }
5154 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005155 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005156 *str = bytes2;
5157 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005158}
5159
5160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005161/* Primary internal function which creates utf8 encoded bytes objects.
5162
5163 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005164 and allocate exactly as much space needed at the end. Else allocate the
5165 maximum possible needed (4 result bytes per Unicode character), and return
5166 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005167*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005168PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005169_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
Victor Stinner6099a032011-12-18 14:22:26 +01005171 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005172 void *data;
5173 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005175 if (!PyUnicode_Check(unicode)) {
5176 PyErr_BadArgument();
5177 return NULL;
5178 }
5179
5180 if (PyUnicode_READY(unicode) == -1)
5181 return NULL;
5182
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005183 if (PyUnicode_UTF8(unicode))
5184 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5185 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005186
5187 kind = PyUnicode_KIND(unicode);
5188 data = PyUnicode_DATA(unicode);
5189 size = PyUnicode_GET_LENGTH(unicode);
5190
Benjamin Petersonead6b532011-12-20 17:23:42 -06005191 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005192 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005193 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005194 case PyUnicode_1BYTE_KIND:
5195 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5196 assert(!PyUnicode_IS_ASCII(unicode));
5197 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5198 case PyUnicode_2BYTE_KIND:
5199 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5200 case PyUnicode_4BYTE_KIND:
5201 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203}
5204
Alexander Belopolsky40018472011-02-26 01:02:56 +00005205PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005206PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5207 Py_ssize_t size,
5208 const char *errors)
5209{
5210 PyObject *v, *unicode;
5211
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005212 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005213 if (unicode == NULL)
5214 return NULL;
5215 v = _PyUnicode_AsUTF8String(unicode, errors);
5216 Py_DECREF(unicode);
5217 return v;
5218}
5219
5220PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005221PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005223 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224}
5225
Walter Dörwald41980ca2007-08-16 21:55:45 +00005226/* --- UTF-32 Codec ------------------------------------------------------- */
5227
5228PyObject *
5229PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 Py_ssize_t size,
5231 const char *errors,
5232 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233{
5234 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5235}
5236
5237PyObject *
5238PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 Py_ssize_t size,
5240 const char *errors,
5241 int *byteorder,
5242 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005243{
5244 const char *starts = s;
5245 Py_ssize_t startinpos;
5246 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005247 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005248 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005249 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005250 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005251 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005252 PyObject *errorHandler = NULL;
5253 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005254
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255 q = (unsigned char *)s;
5256 e = q + size;
5257
5258 if (byteorder)
5259 bo = *byteorder;
5260
5261 /* Check for BOM marks (U+FEFF) in the input and adjust current
5262 byte order setting accordingly. In native mode, the leading BOM
5263 mark is skipped, in all other modes, it is copied to the output
5264 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005265 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005266 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005267 if (bom == 0x0000FEFF) {
5268 bo = -1;
5269 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005271 else if (bom == 0xFFFE0000) {
5272 bo = 1;
5273 q += 4;
5274 }
5275 if (byteorder)
5276 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005277 }
5278
Victor Stinnere64322e2012-10-30 23:12:47 +01005279 if (q == e) {
5280 if (consumed)
5281 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005282 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005283 }
5284
Victor Stinnere64322e2012-10-30 23:12:47 +01005285#ifdef WORDS_BIGENDIAN
5286 le = bo < 0;
5287#else
5288 le = bo <= 0;
5289#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005290 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005291
Victor Stinner8f674cc2013-04-17 23:02:17 +02005292 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005293 writer.min_length = (e - q + 3) / 4;
5294 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005295 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005296
Victor Stinnere64322e2012-10-30 23:12:47 +01005297 while (1) {
5298 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005300
Victor Stinnere64322e2012-10-30 23:12:47 +01005301 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005302 enum PyUnicode_Kind kind = writer.kind;
5303 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005304 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005305 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005306 if (le) {
5307 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005308 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005309 if (ch > maxch)
5310 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005311 if (kind != PyUnicode_1BYTE_KIND &&
5312 Py_UNICODE_IS_SURROGATE(ch))
5313 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005314 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005315 q += 4;
5316 } while (q <= last);
5317 }
5318 else {
5319 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005320 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005321 if (ch > maxch)
5322 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005323 if (kind != PyUnicode_1BYTE_KIND &&
5324 Py_UNICODE_IS_SURROGATE(ch))
5325 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005326 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005327 q += 4;
5328 } while (q <= last);
5329 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005330 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005331 }
5332
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005333 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005334 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005335 startinpos = ((const char *)q) - starts;
5336 endinpos = startinpos + 4;
5337 }
5338 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005339 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005341 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005343 startinpos = ((const char *)q) - starts;
5344 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005346 else {
5347 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005348 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005349 goto onError;
5350 q += 4;
5351 continue;
5352 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005353 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005354 startinpos = ((const char *)q) - starts;
5355 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005357
5358 /* The remaining input chars are ignored if the callback
5359 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005360 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005362 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005366 }
5367
Walter Dörwald41980ca2007-08-16 21:55:45 +00005368 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371 Py_XDECREF(errorHandler);
5372 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005374
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005376 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005377 Py_XDECREF(errorHandler);
5378 Py_XDECREF(exc);
5379 return NULL;
5380}
5381
5382PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005383_PyUnicode_EncodeUTF32(PyObject *str,
5384 const char *errors,
5385 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005386{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005387 enum PyUnicode_Kind kind;
5388 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005390 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005391 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005392#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005393 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005394#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005395 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005396#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005397 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005398 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005399 PyObject *errorHandler = NULL;
5400 PyObject *exc = NULL;
5401 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005402
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005403 if (!PyUnicode_Check(str)) {
5404 PyErr_BadArgument();
5405 return NULL;
5406 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005407 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005408 return NULL;
5409 kind = PyUnicode_KIND(str);
5410 data = PyUnicode_DATA(str);
5411 len = PyUnicode_GET_LENGTH(str);
5412
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005413 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005414 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005415 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005416 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005417 if (v == NULL)
5418 return NULL;
5419
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005420 /* output buffer is 4-bytes aligned */
5421 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005422 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005423 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005424 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005425 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005426 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005427
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005428 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005429 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005431 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005432 else
5433 encoding = "utf-32";
5434
5435 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005436 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5437 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005438 }
5439
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 pos = 0;
5441 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005442 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005443
5444 if (kind == PyUnicode_2BYTE_KIND) {
5445 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5446 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005447 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005448 else {
5449 assert(kind == PyUnicode_4BYTE_KIND);
5450 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5451 &out, native_ordering);
5452 }
5453 if (pos == len)
5454 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005455
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005456 rep = unicode_encode_call_errorhandler(
5457 errors, &errorHandler,
5458 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005459 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005460 if (!rep)
5461 goto error;
5462
5463 if (PyBytes_Check(rep)) {
5464 repsize = PyBytes_GET_SIZE(rep);
5465 if (repsize & 3) {
5466 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005467 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005468 "surrogates not allowed");
5469 goto error;
5470 }
5471 moreunits = repsize / 4;
5472 }
5473 else {
5474 assert(PyUnicode_Check(rep));
5475 if (PyUnicode_READY(rep) < 0)
5476 goto error;
5477 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5478 if (!PyUnicode_IS_ASCII(rep)) {
5479 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005480 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005481 "surrogates not allowed");
5482 goto error;
5483 }
5484 }
5485
5486 /* four bytes are reserved for each surrogate */
5487 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005488 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005489 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005490 /* integer overflow */
5491 PyErr_NoMemory();
5492 goto error;
5493 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005494 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005495 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005496 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 }
5498
5499 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005500 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005501 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005502 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005504 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5505 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005506 }
5507
5508 Py_CLEAR(rep);
5509 }
5510
5511 /* Cut back to size actually needed. This is necessary for, for example,
5512 encoding of a string containing isolated surrogates and the 'ignore'
5513 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005514 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005515 if (nsize != PyBytes_GET_SIZE(v))
5516 _PyBytes_Resize(&v, nsize);
5517 Py_XDECREF(errorHandler);
5518 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005519 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005520 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005521 error:
5522 Py_XDECREF(rep);
5523 Py_XDECREF(errorHandler);
5524 Py_XDECREF(exc);
5525 Py_XDECREF(v);
5526 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005527}
5528
Alexander Belopolsky40018472011-02-26 01:02:56 +00005529PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005530PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5531 Py_ssize_t size,
5532 const char *errors,
5533 int byteorder)
5534{
5535 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005536 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005537 if (tmp == NULL)
5538 return NULL;
5539 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5540 Py_DECREF(tmp);
5541 return result;
5542}
5543
5544PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005545PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005546{
Victor Stinnerb960b342011-11-20 19:12:52 +01005547 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005548}
5549
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550/* --- UTF-16 Codec ------------------------------------------------------- */
5551
Tim Peters772747b2001-08-09 22:21:55 +00005552PyObject *
5553PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 Py_ssize_t size,
5555 const char *errors,
5556 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557{
Walter Dörwald69652032004-09-07 20:24:22 +00005558 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5559}
5560
5561PyObject *
5562PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 Py_ssize_t size,
5564 const char *errors,
5565 int *byteorder,
5566 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005567{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005568 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005569 Py_ssize_t startinpos;
5570 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005571 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005572 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005573 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005575 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576 PyObject *errorHandler = NULL;
5577 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005578 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
Tim Peters772747b2001-08-09 22:21:55 +00005580 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005581 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
5583 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005584 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005586 /* Check for BOM marks (U+FEFF) in the input and adjust current
5587 byte order setting accordingly. In native mode, the leading BOM
5588 mark is skipped, in all other modes, it is copied to the output
5589 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005590 if (bo == 0 && size >= 2) {
5591 const Py_UCS4 bom = (q[1] << 8) | q[0];
5592 if (bom == 0xFEFF) {
5593 q += 2;
5594 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005596 else if (bom == 0xFFFE) {
5597 q += 2;
5598 bo = 1;
5599 }
5600 if (byteorder)
5601 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603
Antoine Pitrou63065d72012-05-15 23:48:04 +02005604 if (q == e) {
5605 if (consumed)
5606 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005607 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005608 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005609
Christian Heimes743e0cd2012-10-17 23:52:17 +02005610#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005612 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005613#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005614 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005615 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005616#endif
Tim Peters772747b2001-08-09 22:21:55 +00005617
Antoine Pitrou63065d72012-05-15 23:48:04 +02005618 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005619 character count normally. Error handler will take care of
5620 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005621 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005622 writer.min_length = (e - q + 1) / 2;
5623 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005624 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005625
Antoine Pitrou63065d72012-05-15 23:48:04 +02005626 while (1) {
5627 Py_UCS4 ch = 0;
5628 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005630 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005631 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005632 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 native_ordering);
5635 else
5636 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005637 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005638 native_ordering);
5639 } else if (kind == PyUnicode_2BYTE_KIND) {
5640 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005641 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005642 native_ordering);
5643 } else {
5644 assert(kind == PyUnicode_4BYTE_KIND);
5645 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005646 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005647 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005648 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005649 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650
Antoine Pitrou63065d72012-05-15 23:48:04 +02005651 switch (ch)
5652 {
5653 case 0:
5654 /* remaining byte at the end? (size should be even) */
5655 if (q == e || consumed)
5656 goto End;
5657 errmsg = "truncated data";
5658 startinpos = ((const char *)q) - starts;
5659 endinpos = ((const char *)e) - starts;
5660 break;
5661 /* The remaining input chars are ignored if the callback
5662 chooses to skip the input */
5663 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005664 q -= 2;
5665 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005666 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005667 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005668 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005669 endinpos = ((const char *)e) - starts;
5670 break;
5671 case 2:
5672 errmsg = "illegal encoding";
5673 startinpos = ((const char *)q) - 2 - starts;
5674 endinpos = startinpos + 2;
5675 break;
5676 case 3:
5677 errmsg = "illegal UTF-16 surrogate";
5678 startinpos = ((const char *)q) - 4 - starts;
5679 endinpos = startinpos + 2;
5680 break;
5681 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005682 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005683 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 continue;
5685 }
5686
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005687 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005688 errors,
5689 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005690 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005691 &starts,
5692 (const char **)&e,
5693 &startinpos,
5694 &endinpos,
5695 &exc,
5696 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005697 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 }
5700
Antoine Pitrou63065d72012-05-15 23:48:04 +02005701End:
Walter Dörwald69652032004-09-07 20:24:22 +00005702 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705 Py_XDECREF(errorHandler);
5706 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005710 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711 Py_XDECREF(errorHandler);
5712 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 return NULL;
5714}
5715
Tim Peters772747b2001-08-09 22:21:55 +00005716PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005717_PyUnicode_EncodeUTF16(PyObject *str,
5718 const char *errors,
5719 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005721 enum PyUnicode_Kind kind;
5722 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005723 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005724 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005725 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005726 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005727#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005728 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005729#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005730 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005731#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005732 const char *encoding;
5733 Py_ssize_t nsize, pos;
5734 PyObject *errorHandler = NULL;
5735 PyObject *exc = NULL;
5736 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005737
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005738 if (!PyUnicode_Check(str)) {
5739 PyErr_BadArgument();
5740 return NULL;
5741 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005742 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005743 return NULL;
5744 kind = PyUnicode_KIND(str);
5745 data = PyUnicode_DATA(str);
5746 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005747
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005748 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005749 if (kind == PyUnicode_4BYTE_KIND) {
5750 const Py_UCS4 *in = (const Py_UCS4 *)data;
5751 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005752 while (in < end) {
5753 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005754 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005755 }
5756 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005757 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005758 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005760 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005761 nsize = len + pairs + (byteorder == 0);
5762 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005763 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005767 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005768 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005769 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005770 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005771 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005772 }
5773 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005774 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005775 }
Tim Peters772747b2001-08-09 22:21:55 +00005776
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005777 if (kind == PyUnicode_1BYTE_KIND) {
5778 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5779 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005780 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005781
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005782 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005783 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005784 }
5785 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005786 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005787 }
5788 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005789 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005790 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005791
5792 pos = 0;
5793 while (pos < len) {
5794 Py_ssize_t repsize, moreunits;
5795
5796 if (kind == PyUnicode_2BYTE_KIND) {
5797 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5798 &out, native_ordering);
5799 }
5800 else {
5801 assert(kind == PyUnicode_4BYTE_KIND);
5802 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5803 &out, native_ordering);
5804 }
5805 if (pos == len)
5806 break;
5807
5808 rep = unicode_encode_call_errorhandler(
5809 errors, &errorHandler,
5810 encoding, "surrogates not allowed",
5811 str, &exc, pos, pos + 1, &pos);
5812 if (!rep)
5813 goto error;
5814
5815 if (PyBytes_Check(rep)) {
5816 repsize = PyBytes_GET_SIZE(rep);
5817 if (repsize & 1) {
5818 raise_encode_exception(&exc, encoding,
5819 str, pos - 1, pos,
5820 "surrogates not allowed");
5821 goto error;
5822 }
5823 moreunits = repsize / 2;
5824 }
5825 else {
5826 assert(PyUnicode_Check(rep));
5827 if (PyUnicode_READY(rep) < 0)
5828 goto error;
5829 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5830 if (!PyUnicode_IS_ASCII(rep)) {
5831 raise_encode_exception(&exc, encoding,
5832 str, pos - 1, pos,
5833 "surrogates not allowed");
5834 goto error;
5835 }
5836 }
5837
5838 /* two bytes are reserved for each surrogate */
5839 if (moreunits > 1) {
5840 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005841 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005842 /* integer overflow */
5843 PyErr_NoMemory();
5844 goto error;
5845 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005846 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005847 goto error;
5848 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5849 }
5850
5851 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005852 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005853 out += moreunits;
5854 } else /* rep is unicode */ {
5855 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5856 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5857 &out, native_ordering);
5858 }
5859
5860 Py_CLEAR(rep);
5861 }
5862
5863 /* Cut back to size actually needed. This is necessary for, for example,
5864 encoding of a string containing isolated surrogates and the 'ignore' handler
5865 is used. */
5866 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5867 if (nsize != PyBytes_GET_SIZE(v))
5868 _PyBytes_Resize(&v, nsize);
5869 Py_XDECREF(errorHandler);
5870 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005871 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005872 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005873 error:
5874 Py_XDECREF(rep);
5875 Py_XDECREF(errorHandler);
5876 Py_XDECREF(exc);
5877 Py_XDECREF(v);
5878 return NULL;
5879#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880}
5881
Alexander Belopolsky40018472011-02-26 01:02:56 +00005882PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005883PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5884 Py_ssize_t size,
5885 const char *errors,
5886 int byteorder)
5887{
5888 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005889 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005890 if (tmp == NULL)
5891 return NULL;
5892 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5893 Py_DECREF(tmp);
5894 return result;
5895}
5896
5897PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005898PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005900 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901}
5902
5903/* --- Unicode Escape Codec ----------------------------------------------- */
5904
Fredrik Lundh06d12682001-01-24 07:59:11 +00005905static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005906
Alexander Belopolsky40018472011-02-26 01:02:56 +00005907PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005908_PyUnicode_DecodeUnicodeEscape(const char *s,
5909 Py_ssize_t size,
5910 const char *errors,
5911 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005913 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005914 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 PyObject *errorHandler = NULL;
5917 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005918
Eric V. Smith42454af2016-10-31 09:22:08 -04005919 // so we can remember if we've seen an invalid escape char or not
5920 *first_invalid_escape = NULL;
5921
Victor Stinner62ec3312016-09-06 17:04:34 -07005922 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005923 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005924 }
5925 /* Escaped strings will always be longer than the resulting
5926 Unicode string, so we start with size here and then reduce the
5927 length after conversion to the true value.
5928 (but if the error callback returns a long replacement string
5929 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005930 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005931 writer.min_length = size;
5932 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5933 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005934 }
5935
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 end = s + size;
5937 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005938 unsigned char c = (unsigned char) *s++;
5939 Py_UCS4 ch;
5940 int count;
5941 Py_ssize_t startinpos;
5942 Py_ssize_t endinpos;
5943 const char *message;
5944
5945#define WRITE_ASCII_CHAR(ch) \
5946 do { \
5947 assert(ch <= 127); \
5948 assert(writer.pos < writer.size); \
5949 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5950 } while(0)
5951
5952#define WRITE_CHAR(ch) \
5953 do { \
5954 if (ch <= writer.maxchar) { \
5955 assert(writer.pos < writer.size); \
5956 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5957 } \
5958 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5959 goto onError; \
5960 } \
5961 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962
5963 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005964 if (c != '\\') {
5965 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 continue;
5967 }
5968
Victor Stinner62ec3312016-09-06 17:04:34 -07005969 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005971 if (s >= end) {
5972 message = "\\ at end of string";
5973 goto error;
5974 }
5975 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005976
Victor Stinner62ec3312016-09-06 17:04:34 -07005977 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005978 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005981 case '\n': continue;
5982 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5983 case '\'': WRITE_ASCII_CHAR('\''); continue;
5984 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5985 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005986 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005987 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5988 case 't': WRITE_ASCII_CHAR('\t'); continue;
5989 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5990 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005991 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005992 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005993 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005994 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 case '0': case '1': case '2': case '3':
5998 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005999 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006000 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006001 ch = (ch<<3) + *s++ - '0';
6002 if (s < end && '0' <= *s && *s <= '7') {
6003 ch = (ch<<3) + *s++ - '0';
6004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006006 WRITE_CHAR(ch);
6007 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 /* hex escapes */
6010 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006012 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006013 message = "truncated \\xXX escape";
6014 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006018 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006019 message = "truncated \\uXXXX escape";
6020 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006023 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006024 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006025 message = "truncated \\UXXXXXXXX escape";
6026 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006027 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006028 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006029 ch <<= 4;
6030 if (c >= '0' && c <= '9') {
6031 ch += c - '0';
6032 }
6033 else if (c >= 'a' && c <= 'f') {
6034 ch += c - ('a' - 10);
6035 }
6036 else if (c >= 'A' && c <= 'F') {
6037 ch += c - ('A' - 10);
6038 }
6039 else {
6040 break;
6041 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006042 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006043 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006044 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006045 }
6046
6047 /* when we get here, ch is a 32-bit unicode character */
6048 if (ch > MAX_UNICODE) {
6049 message = "illegal Unicode character";
6050 goto error;
6051 }
6052
6053 WRITE_CHAR(ch);
6054 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006055
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006057 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006058 if (ucnhash_CAPI == NULL) {
6059 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006060 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6061 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006062 if (ucnhash_CAPI == NULL) {
6063 PyErr_SetString(
6064 PyExc_UnicodeError,
6065 "\\N escapes not supported (can't load unicodedata module)"
6066 );
6067 goto onError;
6068 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006069 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006070
6071 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006072 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006073 const char *start = ++s;
6074 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006075 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006076 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006077 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006078 namelen = s - start;
6079 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006080 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006081 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006082 ch = 0xffffffff; /* in case 'getcode' messes up */
6083 if (namelen <= INT_MAX &&
6084 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6085 &ch, 0)) {
6086 assert(ch <= MAX_UNICODE);
6087 WRITE_CHAR(ch);
6088 continue;
6089 }
6090 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006091 }
6092 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006093 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006094
6095 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006096 if (*first_invalid_escape == NULL) {
6097 *first_invalid_escape = s-1; /* Back up one char, since we've
6098 already incremented s. */
6099 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006100 WRITE_ASCII_CHAR('\\');
6101 WRITE_CHAR(c);
6102 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006104
6105 error:
6106 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006107 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006108 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006109 errors, &errorHandler,
6110 "unicodeescape", message,
6111 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006112 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006113 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006114 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006115 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006116
6117#undef WRITE_ASCII_CHAR
6118#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006120
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006121 Py_XDECREF(errorHandler);
6122 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006123 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006124
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006126 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006127 Py_XDECREF(errorHandler);
6128 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 return NULL;
6130}
6131
Eric V. Smith42454af2016-10-31 09:22:08 -04006132PyObject *
6133PyUnicode_DecodeUnicodeEscape(const char *s,
6134 Py_ssize_t size,
6135 const char *errors)
6136{
6137 const char *first_invalid_escape;
6138 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6139 &first_invalid_escape);
6140 if (result == NULL)
6141 return NULL;
6142 if (first_invalid_escape != NULL) {
6143 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6144 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006145 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006146 Py_DECREF(result);
6147 return NULL;
6148 }
6149 }
6150 return result;
6151}
6152
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006153/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154
Alexander Belopolsky40018472011-02-26 01:02:56 +00006155PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006159 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006161 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006163 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164
Ezio Melottie7f90372012-10-05 03:33:31 +03006165 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006166 escape.
6167
Ezio Melottie7f90372012-10-05 03:33:31 +03006168 For UCS1 strings it's '\xxx', 4 bytes per source character.
6169 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6170 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006171 */
6172
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 if (!PyUnicode_Check(unicode)) {
6174 PyErr_BadArgument();
6175 return NULL;
6176 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006177 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006178 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006179 }
Victor Stinner358af132015-10-12 22:36:57 +02006180
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006182 if (len == 0) {
6183 return PyBytes_FromStringAndSize(NULL, 0);
6184 }
6185
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006186 kind = PyUnicode_KIND(unicode);
6187 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006188 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6189 bytes, and 1 byte characters 4. */
6190 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006191 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006192 return PyErr_NoMemory();
6193 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006194 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006195 if (repr == NULL) {
6196 return NULL;
6197 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006198
Victor Stinner62ec3312016-09-06 17:04:34 -07006199 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006200 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006201 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006202
Victor Stinner62ec3312016-09-06 17:04:34 -07006203 /* U+0000-U+00ff range */
6204 if (ch < 0x100) {
6205 if (ch >= ' ' && ch < 127) {
6206 if (ch != '\\') {
6207 /* Copy printable US ASCII as-is */
6208 *p++ = (char) ch;
6209 }
6210 /* Escape backslashes */
6211 else {
6212 *p++ = '\\';
6213 *p++ = '\\';
6214 }
6215 }
Victor Stinner358af132015-10-12 22:36:57 +02006216
Victor Stinner62ec3312016-09-06 17:04:34 -07006217 /* Map special whitespace to '\t', \n', '\r' */
6218 else if (ch == '\t') {
6219 *p++ = '\\';
6220 *p++ = 't';
6221 }
6222 else if (ch == '\n') {
6223 *p++ = '\\';
6224 *p++ = 'n';
6225 }
6226 else if (ch == '\r') {
6227 *p++ = '\\';
6228 *p++ = 'r';
6229 }
6230
6231 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6232 else {
6233 *p++ = '\\';
6234 *p++ = 'x';
6235 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6236 *p++ = Py_hexdigits[ch & 0x000F];
6237 }
Tim Petersced69f82003-09-16 20:30:58 +00006238 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006239 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006240 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 *p++ = '\\';
6242 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006243 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6244 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6245 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6246 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6249 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006250
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 /* Make sure that the first two digits are zero */
6252 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006253 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006254 *p++ = 'U';
6255 *p++ = '0';
6256 *p++ = '0';
6257 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6258 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6259 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6260 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6261 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6262 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265
Victor Stinner62ec3312016-09-06 17:04:34 -07006266 assert(p - PyBytes_AS_STRING(repr) > 0);
6267 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6268 return NULL;
6269 }
6270 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271}
6272
Alexander Belopolsky40018472011-02-26 01:02:56 +00006273PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006274PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6275 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006277 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006278 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006279 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006281 }
6282
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006283 result = PyUnicode_AsUnicodeEscapeString(tmp);
6284 Py_DECREF(tmp);
6285 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286}
6287
6288/* --- Raw Unicode Escape Codec ------------------------------------------- */
6289
Alexander Belopolsky40018472011-02-26 01:02:56 +00006290PyObject *
6291PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006292 Py_ssize_t size,
6293 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006295 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006296 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298 PyObject *errorHandler = NULL;
6299 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006300
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006302 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006303 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006304
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 /* Escaped strings will always be longer than the resulting
6306 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 length after conversion to the true value. (But decoding error
6308 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006309 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006310 writer.min_length = size;
6311 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6312 goto onError;
6313 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006314
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 end = s + size;
6316 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 unsigned char c = (unsigned char) *s++;
6318 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006319 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006320 Py_ssize_t startinpos;
6321 Py_ssize_t endinpos;
6322 const char *message;
6323
6324#define WRITE_CHAR(ch) \
6325 do { \
6326 if (ch <= writer.maxchar) { \
6327 assert(writer.pos < writer.size); \
6328 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6329 } \
6330 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6331 goto onError; \
6332 } \
6333 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 if (c != '\\' || s >= end) {
6337 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006339 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006340
Victor Stinner62ec3312016-09-06 17:04:34 -07006341 c = (unsigned char) *s++;
6342 if (c == 'u') {
6343 count = 4;
6344 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006346 else if (c == 'U') {
6347 count = 8;
6348 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006349 }
6350 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006351 assert(writer.pos < writer.size);
6352 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6353 WRITE_CHAR(c);
6354 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006355 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006356 startinpos = s - starts - 2;
6357
6358 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6359 for (ch = 0; count && s < end; ++s, --count) {
6360 c = (unsigned char)*s;
6361 ch <<= 4;
6362 if (c >= '0' && c <= '9') {
6363 ch += c - '0';
6364 }
6365 else if (c >= 'a' && c <= 'f') {
6366 ch += c - ('a' - 10);
6367 }
6368 else if (c >= 'A' && c <= 'F') {
6369 ch += c - ('A' - 10);
6370 }
6371 else {
6372 break;
6373 }
6374 }
6375 if (!count) {
6376 if (ch <= MAX_UNICODE) {
6377 WRITE_CHAR(ch);
6378 continue;
6379 }
6380 message = "\\Uxxxxxxxx out of range";
6381 }
6382
6383 endinpos = s-starts;
6384 writer.min_length = end - s + writer.pos;
6385 if (unicode_decode_call_errorhandler_writer(
6386 errors, &errorHandler,
6387 "rawunicodeescape", message,
6388 &starts, &end, &startinpos, &endinpos, &exc, &s,
6389 &writer)) {
6390 goto onError;
6391 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006392 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006393
6394#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 Py_XDECREF(errorHandler);
6397 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006398 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006399
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006401 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 Py_XDECREF(errorHandler);
6403 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006405
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406}
6407
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006408
Alexander Belopolsky40018472011-02-26 01:02:56 +00006409PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006410PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411{
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006414 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006415 int kind;
6416 void *data;
6417 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006419 if (!PyUnicode_Check(unicode)) {
6420 PyErr_BadArgument();
6421 return NULL;
6422 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006424 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006426 kind = PyUnicode_KIND(unicode);
6427 data = PyUnicode_DATA(unicode);
6428 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 if (kind == PyUnicode_1BYTE_KIND) {
6430 return PyBytes_FromStringAndSize(data, len);
6431 }
Victor Stinner0e368262011-11-10 20:12:49 +01006432
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6434 bytes, and 1 byte characters 4. */
6435 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006436
Victor Stinner62ec3312016-09-06 17:04:34 -07006437 if (len > PY_SSIZE_T_MAX / expandsize) {
6438 return PyErr_NoMemory();
6439 }
6440 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6441 if (repr == NULL) {
6442 return NULL;
6443 }
6444 if (len == 0) {
6445 return repr;
6446 }
6447
6448 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006449 for (pos = 0; pos < len; pos++) {
6450 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006451
Victor Stinner62ec3312016-09-06 17:04:34 -07006452 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6453 if (ch < 0x100) {
6454 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006455 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006456 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006457 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 *p++ = '\\';
6459 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006460 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6461 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6462 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6463 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006465 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6466 else {
6467 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6468 *p++ = '\\';
6469 *p++ = 'U';
6470 *p++ = '0';
6471 *p++ = '0';
6472 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6473 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6474 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6475 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6476 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6477 *p++ = Py_hexdigits[ch & 15];
6478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006480
Victor Stinner62ec3312016-09-06 17:04:34 -07006481 assert(p > PyBytes_AS_STRING(repr));
6482 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6483 return NULL;
6484 }
6485 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486}
6487
Alexander Belopolsky40018472011-02-26 01:02:56 +00006488PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006489PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6490 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006493 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006494 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006495 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006496 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6497 Py_DECREF(tmp);
6498 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499}
6500
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006501/* --- Unicode Internal Codec ------------------------------------------- */
6502
Alexander Belopolsky40018472011-02-26 01:02:56 +00006503PyObject *
6504_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006505 Py_ssize_t size,
6506 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006507{
6508 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006509 Py_ssize_t startinpos;
6510 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006511 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006512 const char *end;
6513 const char *reason;
6514 PyObject *errorHandler = NULL;
6515 PyObject *exc = NULL;
6516
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006517 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006518 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006519 1))
6520 return NULL;
6521
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006522 if (size < 0) {
6523 PyErr_BadInternalCall();
6524 return NULL;
6525 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006526 if (size == 0)
6527 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006528
Victor Stinner8f674cc2013-04-17 23:02:17 +02006529 _PyUnicodeWriter_Init(&writer);
6530 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6531 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006533 }
6534 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006535
Victor Stinner8f674cc2013-04-17 23:02:17 +02006536 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006537 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006538 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006539 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006540 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006541 endinpos = end-starts;
6542 reason = "truncated input";
6543 goto error;
6544 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006545 /* We copy the raw representation one byte at a time because the
6546 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006547 ((char *) &uch)[0] = s[0];
6548 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006549#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006550 ((char *) &uch)[2] = s[2];
6551 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006552#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006553 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006554#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006555 /* We have to sanity check the raw data, otherwise doom looms for
6556 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006557 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006558 endinpos = s - starts + Py_UNICODE_SIZE;
6559 reason = "illegal code point (> 0x10FFFF)";
6560 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006561 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006562#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006563 s += Py_UNICODE_SIZE;
6564#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006565 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006566 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006567 Py_UNICODE uch2;
6568 ((char *) &uch2)[0] = s[0];
6569 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006570 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006571 {
Victor Stinner551ac952011-11-29 22:58:13 +01006572 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006573 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006574 }
6575 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006576#endif
6577
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006578 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006579 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006580 continue;
6581
6582 error:
6583 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006584 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006585 errors, &errorHandler,
6586 "unicode_internal", reason,
6587 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006588 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006589 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006590 }
6591
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006592 Py_XDECREF(errorHandler);
6593 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006594 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006595
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006597 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006598 Py_XDECREF(errorHandler);
6599 Py_XDECREF(exc);
6600 return NULL;
6601}
6602
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603/* --- Latin-1 Codec ------------------------------------------------------ */
6604
Alexander Belopolsky40018472011-02-26 01:02:56 +00006605PyObject *
6606PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006607 Py_ssize_t size,
6608 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006611 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612}
6613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006615static void
6616make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006617 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006618 PyObject *unicode,
6619 Py_ssize_t startpos, Py_ssize_t endpos,
6620 const char *reason)
6621{
6622 if (*exceptionObject == NULL) {
6623 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006625 encoding, unicode, startpos, endpos, reason);
6626 }
6627 else {
6628 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6629 goto onError;
6630 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6631 goto onError;
6632 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6633 goto onError;
6634 return;
6635 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006636 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006637 }
6638}
6639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006640/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006641static void
6642raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006643 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006644 PyObject *unicode,
6645 Py_ssize_t startpos, Py_ssize_t endpos,
6646 const char *reason)
6647{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006648 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006649 encoding, unicode, startpos, endpos, reason);
6650 if (*exceptionObject != NULL)
6651 PyCodec_StrictErrors(*exceptionObject);
6652}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653
6654/* error handling callback helper:
6655 build arguments, call the callback and check the arguments,
6656 put the result into newpos and return the replacement string, which
6657 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658static PyObject *
6659unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006660 PyObject **errorHandler,
6661 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006662 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006663 Py_ssize_t startpos, Py_ssize_t endpos,
6664 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006666 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006667 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 PyObject *restuple;
6669 PyObject *resunicode;
6670
6671 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006673 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006675 }
6676
Benjamin Petersonbac79492012-01-14 13:34:47 -05006677 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 return NULL;
6679 len = PyUnicode_GET_LENGTH(unicode);
6680
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006681 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006682 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006686 restuple = PyObject_CallFunctionObjArgs(
6687 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006691 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006692 Py_DECREF(restuple);
6693 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006695 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 &resunicode, newpos)) {
6697 Py_DECREF(restuple);
6698 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006700 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6701 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6702 Py_DECREF(restuple);
6703 return NULL;
6704 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006706 *newpos = len + *newpos;
6707 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006708 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 Py_DECREF(restuple);
6710 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006711 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006712 Py_INCREF(resunicode);
6713 Py_DECREF(restuple);
6714 return resunicode;
6715}
6716
Alexander Belopolsky40018472011-02-26 01:02:56 +00006717static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006719 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006720 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006722 /* input state */
6723 Py_ssize_t pos=0, size;
6724 int kind;
6725 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006726 /* pointer into the output */
6727 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006728 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6729 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006730 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006731 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006732 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006733 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006734 /* output object */
6735 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736
Benjamin Petersonbac79492012-01-14 13:34:47 -05006737 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006738 return NULL;
6739 size = PyUnicode_GET_LENGTH(unicode);
6740 kind = PyUnicode_KIND(unicode);
6741 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 /* allocate enough for a simple encoding without
6743 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006744 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006745 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006746
6747 _PyBytesWriter_Init(&writer);
6748 str = _PyBytesWriter_Alloc(&writer, size);
6749 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006750 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006752 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006753 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006756 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006758 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006760 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006762 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006764 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006765 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006767
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006768 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006770
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006771 /* Only overallocate the buffer if it's not the last write */
6772 writer.overallocate = (collend < size);
6773
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006775 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006776 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006777
6778 switch (error_handler) {
6779 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006780 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006782
6783 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006784 memset(str, '?', collend - collstart);
6785 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006786 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006787 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 break;
Victor Stinner50149202015-09-22 00:26:54 +02006790
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006791 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006792 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006793 writer.min_size -= (collend - collstart);
6794 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006795 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006796 if (str == NULL)
6797 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006798 pos = collend;
6799 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006800
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006801 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006802 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006803 writer.min_size -= (collend - collstart);
6804 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006805 unicode, collstart, collend);
6806 if (str == NULL)
6807 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006808 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 break;
Victor Stinner50149202015-09-22 00:26:54 +02006810
Victor Stinnerc3713e92015-09-29 12:32:13 +02006811 case _Py_ERROR_SURROGATEESCAPE:
6812 for (i = collstart; i < collend; ++i) {
6813 ch = PyUnicode_READ(kind, data, i);
6814 if (ch < 0xdc80 || 0xdcff < ch) {
6815 /* Not a UTF-8b surrogate */
6816 break;
6817 }
6818 *str++ = (char)(ch - 0xdc00);
6819 ++pos;
6820 }
6821 if (i >= collend)
6822 break;
6823 collstart = pos;
6824 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006825 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006826
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006828 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6829 encoding, reason, unicode, &exc,
6830 collstart, collend, &newpos);
6831 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006833
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006834 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006835 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006836
Victor Stinner6bd525b2015-10-09 13:10:05 +02006837 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006838 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006839 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006840 PyBytes_AS_STRING(rep),
6841 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006842 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006843 else {
6844 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006845
Victor Stinner6bd525b2015-10-09 13:10:05 +02006846 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006848
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006849 if (limit == 256 ?
6850 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6851 !PyUnicode_IS_ASCII(rep))
6852 {
6853 /* Not all characters are smaller than limit */
6854 raise_encode_exception(&exc, encoding, unicode,
6855 collstart, collend, reason);
6856 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006858 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6859 str = _PyBytesWriter_WriteBytes(&writer, str,
6860 PyUnicode_DATA(rep),
6861 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006863 if (str == NULL)
6864 goto onError;
6865
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006866 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006867 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006868 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006869
6870 /* If overallocation was disabled, ensure that it was the last
6871 write. Otherwise, we missed an optimization */
6872 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006873 }
6874 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006875
Victor Stinner50149202015-09-22 00:26:54 +02006876 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006877 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006878 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006879
6880 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006881 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006882 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006883 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006884 Py_XDECREF(exc);
6885 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006886}
6887
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006888/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006889PyObject *
6890PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006891 Py_ssize_t size,
6892 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006894 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006895 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006896 if (unicode == NULL)
6897 return NULL;
6898 result = unicode_encode_ucs1(unicode, errors, 256);
6899 Py_DECREF(unicode);
6900 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901}
6902
Alexander Belopolsky40018472011-02-26 01:02:56 +00006903PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006904_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905{
6906 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 PyErr_BadArgument();
6908 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006910 if (PyUnicode_READY(unicode) == -1)
6911 return NULL;
6912 /* Fast path: if it is a one-byte string, construct
6913 bytes object directly. */
6914 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6915 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6916 PyUnicode_GET_LENGTH(unicode));
6917 /* Non-Latin-1 characters present. Defer to above function to
6918 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006919 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006920}
6921
6922PyObject*
6923PyUnicode_AsLatin1String(PyObject *unicode)
6924{
6925 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926}
6927
6928/* --- 7-bit ASCII Codec -------------------------------------------------- */
6929
Alexander Belopolsky40018472011-02-26 01:02:56 +00006930PyObject *
6931PyUnicode_DecodeASCII(const char *s,
6932 Py_ssize_t size,
6933 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006935 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006936 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006937 int kind;
6938 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006939 Py_ssize_t startinpos;
6940 Py_ssize_t endinpos;
6941 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006942 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006943 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006944 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006945 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006946
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006948 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006951 if (size == 1 && (unsigned char)s[0] < 128)
6952 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006953
Victor Stinner8f674cc2013-04-17 23:02:17 +02006954 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006955 writer.min_length = size;
6956 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006957 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006959 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006960 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006961 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006962 writer.pos = outpos;
6963 if (writer.pos == size)
6964 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006965
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006966 s += writer.pos;
6967 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006968 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006969 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006971 PyUnicode_WRITE(kind, data, writer.pos, c);
6972 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006974 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006976
6977 /* byte outsize range 0x00..0x7f: call the error handler */
6978
6979 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006980 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006981
6982 switch (error_handler)
6983 {
6984 case _Py_ERROR_REPLACE:
6985 case _Py_ERROR_SURROGATEESCAPE:
6986 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006987 but we may switch to UCS2 at the first write */
6988 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6989 goto onError;
6990 kind = writer.kind;
6991 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006992
6993 if (error_handler == _Py_ERROR_REPLACE)
6994 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6995 else
6996 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6997 writer.pos++;
6998 ++s;
6999 break;
7000
7001 case _Py_ERROR_IGNORE:
7002 ++s;
7003 break;
7004
7005 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 startinpos = s-starts;
7007 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007008 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007009 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 "ascii", "ordinal not in range(128)",
7011 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007012 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007014 kind = writer.kind;
7015 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007018 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007019 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007020 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007021
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007023 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007024 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007025 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 return NULL;
7027}
7028
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007029/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007030PyObject *
7031PyUnicode_EncodeASCII(const Py_UNICODE *p,
7032 Py_ssize_t size,
7033 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007035 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007036 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007037 if (unicode == NULL)
7038 return NULL;
7039 result = unicode_encode_ucs1(unicode, errors, 128);
7040 Py_DECREF(unicode);
7041 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042}
7043
Alexander Belopolsky40018472011-02-26 01:02:56 +00007044PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007045_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046{
7047 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 PyErr_BadArgument();
7049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007051 if (PyUnicode_READY(unicode) == -1)
7052 return NULL;
7053 /* Fast path: if it is an ASCII-only string, construct bytes object
7054 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007055 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007056 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7057 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007058 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007059}
7060
7061PyObject *
7062PyUnicode_AsASCIIString(PyObject *unicode)
7063{
7064 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065}
7066
Steve Dowercc16be82016-09-08 10:35:16 -07007067#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007068
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007069/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007070
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007071#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007072#define NEED_RETRY
7073#endif
7074
Victor Stinner3a50e702011-10-18 21:21:00 +02007075#ifndef WC_ERR_INVALID_CHARS
7076# define WC_ERR_INVALID_CHARS 0x0080
7077#endif
7078
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007079static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007080code_page_name(UINT code_page, PyObject **obj)
7081{
7082 *obj = NULL;
7083 if (code_page == CP_ACP)
7084 return "mbcs";
7085 if (code_page == CP_UTF7)
7086 return "CP_UTF7";
7087 if (code_page == CP_UTF8)
7088 return "CP_UTF8";
7089
7090 *obj = PyBytes_FromFormat("cp%u", code_page);
7091 if (*obj == NULL)
7092 return NULL;
7093 return PyBytes_AS_STRING(*obj);
7094}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007095
Victor Stinner3a50e702011-10-18 21:21:00 +02007096static DWORD
7097decode_code_page_flags(UINT code_page)
7098{
7099 if (code_page == CP_UTF7) {
7100 /* The CP_UTF7 decoder only supports flags=0 */
7101 return 0;
7102 }
7103 else
7104 return MB_ERR_INVALID_CHARS;
7105}
7106
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007108 * Decode a byte string from a Windows code page into unicode object in strict
7109 * mode.
7110 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007111 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7112 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007114static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007115decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007116 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 const char *in,
7118 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007119{
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007121 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007122 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123
7124 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007125 assert(insize > 0);
7126 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7127 if (outsize <= 0)
7128 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007129
7130 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007132 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007133 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 if (*v == NULL)
7135 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007136 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137 }
7138 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007141 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007144 }
7145
7146 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7148 if (outsize <= 0)
7149 goto error;
7150 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007151
Victor Stinner3a50e702011-10-18 21:21:00 +02007152error:
7153 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7154 return -2;
7155 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007156 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007157}
7158
Victor Stinner3a50e702011-10-18 21:21:00 +02007159/*
7160 * Decode a byte string from a code page into unicode object with an error
7161 * handler.
7162 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007163 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 * UnicodeDecodeError exception and returns -1 on error.
7165 */
7166static int
7167decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007168 PyObject **v,
7169 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007170 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007171{
7172 const char *startin = in;
7173 const char *endin = in + size;
7174 const DWORD flags = decode_code_page_flags(code_page);
7175 /* Ideally, we should get reason from FormatMessage. This is the Windows
7176 2000 English version of the message. */
7177 const char *reason = "No mapping for the Unicode character exists "
7178 "in the target code page.";
7179 /* each step cannot decode more than 1 character, but a character can be
7180 represented as a surrogate pair */
7181 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007182 int insize;
7183 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 PyObject *errorHandler = NULL;
7185 PyObject *exc = NULL;
7186 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007187 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 DWORD err;
7189 int ret = -1;
7190
7191 assert(size > 0);
7192
7193 encoding = code_page_name(code_page, &encoding_obj);
7194 if (encoding == NULL)
7195 return -1;
7196
Victor Stinner7d00cc12014-03-17 23:08:06 +01007197 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7199 UnicodeDecodeError. */
7200 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7201 if (exc != NULL) {
7202 PyCodec_StrictErrors(exc);
7203 Py_CLEAR(exc);
7204 }
7205 goto error;
7206 }
7207
7208 if (*v == NULL) {
7209 /* Create unicode object */
7210 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7211 PyErr_NoMemory();
7212 goto error;
7213 }
Victor Stinnerab595942011-12-17 04:59:06 +01007214 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007215 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 if (*v == NULL)
7217 goto error;
7218 startout = PyUnicode_AS_UNICODE(*v);
7219 }
7220 else {
7221 /* Extend unicode object */
7222 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7223 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7224 PyErr_NoMemory();
7225 goto error;
7226 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007227 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 goto error;
7229 startout = PyUnicode_AS_UNICODE(*v) + n;
7230 }
7231
7232 /* Decode the byte string character per character */
7233 out = startout;
7234 while (in < endin)
7235 {
7236 /* Decode a character */
7237 insize = 1;
7238 do
7239 {
7240 outsize = MultiByteToWideChar(code_page, flags,
7241 in, insize,
7242 buffer, Py_ARRAY_LENGTH(buffer));
7243 if (outsize > 0)
7244 break;
7245 err = GetLastError();
7246 if (err != ERROR_NO_UNICODE_TRANSLATION
7247 && err != ERROR_INSUFFICIENT_BUFFER)
7248 {
7249 PyErr_SetFromWindowsErr(0);
7250 goto error;
7251 }
7252 insize++;
7253 }
7254 /* 4=maximum length of a UTF-8 sequence */
7255 while (insize <= 4 && (in + insize) <= endin);
7256
7257 if (outsize <= 0) {
7258 Py_ssize_t startinpos, endinpos, outpos;
7259
Victor Stinner7d00cc12014-03-17 23:08:06 +01007260 /* last character in partial decode? */
7261 if (in + insize >= endin && !final)
7262 break;
7263
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 startinpos = in - startin;
7265 endinpos = startinpos + 1;
7266 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007267 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 errors, &errorHandler,
7269 encoding, reason,
7270 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007271 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 {
7273 goto error;
7274 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007275 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 }
7277 else {
7278 in += insize;
7279 memcpy(out, buffer, outsize * sizeof(wchar_t));
7280 out += outsize;
7281 }
7282 }
7283
7284 /* write a NUL character at the end */
7285 *out = 0;
7286
7287 /* Extend unicode object */
7288 outsize = out - startout;
7289 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007290 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007292 /* (in - startin) <= size and size is an int */
7293 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007294
7295error:
7296 Py_XDECREF(encoding_obj);
7297 Py_XDECREF(errorHandler);
7298 Py_XDECREF(exc);
7299 return ret;
7300}
7301
Victor Stinner3a50e702011-10-18 21:21:00 +02007302static PyObject *
7303decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007304 const char *s, Py_ssize_t size,
7305 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007306{
Victor Stinner76a31a62011-11-04 00:05:13 +01007307 PyObject *v = NULL;
7308 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007309
Victor Stinner3a50e702011-10-18 21:21:00 +02007310 if (code_page < 0) {
7311 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7312 return NULL;
7313 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007314 if (size < 0) {
7315 PyErr_BadInternalCall();
7316 return NULL;
7317 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007318
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007319 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007321
Victor Stinner76a31a62011-11-04 00:05:13 +01007322 do
7323 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007324#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007325 if (size > INT_MAX) {
7326 chunk_size = INT_MAX;
7327 final = 0;
7328 done = 0;
7329 }
7330 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007331#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007332 {
7333 chunk_size = (int)size;
7334 final = (consumed == NULL);
7335 done = 1;
7336 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
Victor Stinner76a31a62011-11-04 00:05:13 +01007338 if (chunk_size == 0 && done) {
7339 if (v != NULL)
7340 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007341 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007342 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007343
Victor Stinner76a31a62011-11-04 00:05:13 +01007344 converted = decode_code_page_strict(code_page, &v,
7345 s, chunk_size);
7346 if (converted == -2)
7347 converted = decode_code_page_errors(code_page, &v,
7348 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007349 errors, final);
7350 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007351
7352 if (converted < 0) {
7353 Py_XDECREF(v);
7354 return NULL;
7355 }
7356
7357 if (consumed)
7358 *consumed += converted;
7359
7360 s += converted;
7361 size -= converted;
7362 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007363
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007364 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007365}
7366
Alexander Belopolsky40018472011-02-26 01:02:56 +00007367PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007368PyUnicode_DecodeCodePageStateful(int code_page,
7369 const char *s,
7370 Py_ssize_t size,
7371 const char *errors,
7372 Py_ssize_t *consumed)
7373{
7374 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7375}
7376
7377PyObject *
7378PyUnicode_DecodeMBCSStateful(const char *s,
7379 Py_ssize_t size,
7380 const char *errors,
7381 Py_ssize_t *consumed)
7382{
7383 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7384}
7385
7386PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007387PyUnicode_DecodeMBCS(const char *s,
7388 Py_ssize_t size,
7389 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007390{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007391 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7392}
7393
Victor Stinner3a50e702011-10-18 21:21:00 +02007394static DWORD
7395encode_code_page_flags(UINT code_page, const char *errors)
7396{
7397 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007398 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 }
7400 else if (code_page == CP_UTF7) {
7401 /* CP_UTF7 only supports flags=0 */
7402 return 0;
7403 }
7404 else {
7405 if (errors != NULL && strcmp(errors, "replace") == 0)
7406 return 0;
7407 else
7408 return WC_NO_BEST_FIT_CHARS;
7409 }
7410}
7411
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007412/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 * Encode a Unicode string to a Windows code page into a byte string in strict
7414 * mode.
7415 *
7416 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007417 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007418 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007419static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007420encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007421 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007423{
Victor Stinner554f3f02010-06-16 23:33:54 +00007424 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 BOOL *pusedDefaultChar = &usedDefaultChar;
7426 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007427 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007428 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 const DWORD flags = encode_code_page_flags(code_page, NULL);
7430 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007431 /* Create a substring so that we can get the UTF-16 representation
7432 of just the slice under consideration. */
7433 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007434
Martin v. Löwis3d325192011-11-04 18:23:06 +01007435 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007436
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007438 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007440 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007441
Victor Stinner2fc507f2011-11-04 20:06:39 +01007442 substring = PyUnicode_Substring(unicode, offset, offset+len);
7443 if (substring == NULL)
7444 return -1;
7445 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7446 if (p == NULL) {
7447 Py_DECREF(substring);
7448 return -1;
7449 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007450 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007451
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007452 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007454 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 NULL, 0,
7456 NULL, pusedDefaultChar);
7457 if (outsize <= 0)
7458 goto error;
7459 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007460 if (pusedDefaultChar && *pusedDefaultChar) {
7461 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007463 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007464
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007468 if (*outbytes == NULL) {
7469 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007471 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473 }
7474 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 const Py_ssize_t n = PyBytes_Size(*outbytes);
7477 if (outsize > PY_SSIZE_T_MAX - n) {
7478 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007479 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007482 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7483 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007485 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007487 }
7488
7489 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007491 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 out, outsize,
7493 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007494 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 if (outsize <= 0)
7496 goto error;
7497 if (pusedDefaultChar && *pusedDefaultChar)
7498 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007499 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007500
Victor Stinner3a50e702011-10-18 21:21:00 +02007501error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007502 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7504 return -2;
7505 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007506 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007507}
7508
Victor Stinner3a50e702011-10-18 21:21:00 +02007509/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007510 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 * error handler.
7512 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007513 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 * -1 on other error.
7515 */
7516static int
7517encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007518 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007519 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007520{
Victor Stinner3a50e702011-10-18 21:21:00 +02007521 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007522 Py_ssize_t pos = unicode_offset;
7523 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 /* Ideally, we should get reason from FormatMessage. This is the Windows
7525 2000 English version of the message. */
7526 const char *reason = "invalid character";
7527 /* 4=maximum length of a UTF-8 sequence */
7528 char buffer[4];
7529 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7530 Py_ssize_t outsize;
7531 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 PyObject *errorHandler = NULL;
7533 PyObject *exc = NULL;
7534 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007535 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007536 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007537 PyObject *rep;
7538 int ret = -1;
7539
7540 assert(insize > 0);
7541
7542 encoding = code_page_name(code_page, &encoding_obj);
7543 if (encoding == NULL)
7544 return -1;
7545
7546 if (errors == NULL || strcmp(errors, "strict") == 0) {
7547 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7548 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007549 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007550 if (exc != NULL) {
7551 PyCodec_StrictErrors(exc);
7552 Py_DECREF(exc);
7553 }
7554 Py_XDECREF(encoding_obj);
7555 return -1;
7556 }
7557
7558 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7559 pusedDefaultChar = &usedDefaultChar;
7560 else
7561 pusedDefaultChar = NULL;
7562
7563 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7564 PyErr_NoMemory();
7565 goto error;
7566 }
7567 outsize = insize * Py_ARRAY_LENGTH(buffer);
7568
7569 if (*outbytes == NULL) {
7570 /* Create string object */
7571 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7572 if (*outbytes == NULL)
7573 goto error;
7574 out = PyBytes_AS_STRING(*outbytes);
7575 }
7576 else {
7577 /* Extend string object */
7578 Py_ssize_t n = PyBytes_Size(*outbytes);
7579 if (n > PY_SSIZE_T_MAX - outsize) {
7580 PyErr_NoMemory();
7581 goto error;
7582 }
7583 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7584 goto error;
7585 out = PyBytes_AS_STRING(*outbytes) + n;
7586 }
7587
7588 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007589 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007590 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007591 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7592 wchar_t chars[2];
7593 int charsize;
7594 if (ch < 0x10000) {
7595 chars[0] = (wchar_t)ch;
7596 charsize = 1;
7597 }
7598 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007599 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7600 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007601 charsize = 2;
7602 }
7603
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007605 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007606 buffer, Py_ARRAY_LENGTH(buffer),
7607 NULL, pusedDefaultChar);
7608 if (outsize > 0) {
7609 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7610 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007611 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007612 memcpy(out, buffer, outsize);
7613 out += outsize;
7614 continue;
7615 }
7616 }
7617 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7618 PyErr_SetFromWindowsErr(0);
7619 goto error;
7620 }
7621
Victor Stinner3a50e702011-10-18 21:21:00 +02007622 rep = unicode_encode_call_errorhandler(
7623 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007624 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007625 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 if (rep == NULL)
7627 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007628 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007629
7630 if (PyBytes_Check(rep)) {
7631 outsize = PyBytes_GET_SIZE(rep);
7632 if (outsize != 1) {
7633 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7634 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7635 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7636 Py_DECREF(rep);
7637 goto error;
7638 }
7639 out = PyBytes_AS_STRING(*outbytes) + offset;
7640 }
7641 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7642 out += outsize;
7643 }
7644 else {
7645 Py_ssize_t i;
7646 enum PyUnicode_Kind kind;
7647 void *data;
7648
Benjamin Petersonbac79492012-01-14 13:34:47 -05007649 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007650 Py_DECREF(rep);
7651 goto error;
7652 }
7653
7654 outsize = PyUnicode_GET_LENGTH(rep);
7655 if (outsize != 1) {
7656 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7657 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7658 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7659 Py_DECREF(rep);
7660 goto error;
7661 }
7662 out = PyBytes_AS_STRING(*outbytes) + offset;
7663 }
7664 kind = PyUnicode_KIND(rep);
7665 data = PyUnicode_DATA(rep);
7666 for (i=0; i < outsize; i++) {
7667 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7668 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007669 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007670 encoding, unicode,
7671 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007672 "unable to encode error handler result to ASCII");
7673 Py_DECREF(rep);
7674 goto error;
7675 }
7676 *out = (unsigned char)ch;
7677 out++;
7678 }
7679 }
7680 Py_DECREF(rep);
7681 }
7682 /* write a NUL byte */
7683 *out = 0;
7684 outsize = out - PyBytes_AS_STRING(*outbytes);
7685 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7686 if (_PyBytes_Resize(outbytes, outsize) < 0)
7687 goto error;
7688 ret = 0;
7689
7690error:
7691 Py_XDECREF(encoding_obj);
7692 Py_XDECREF(errorHandler);
7693 Py_XDECREF(exc);
7694 return ret;
7695}
7696
Victor Stinner3a50e702011-10-18 21:21:00 +02007697static PyObject *
7698encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007699 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007700 const char *errors)
7701{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007702 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007703 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007704 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007705 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007706
Victor Stinner29dacf22015-01-26 16:41:32 +01007707 if (!PyUnicode_Check(unicode)) {
7708 PyErr_BadArgument();
7709 return NULL;
7710 }
7711
Benjamin Petersonbac79492012-01-14 13:34:47 -05007712 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007713 return NULL;
7714 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007715
Victor Stinner3a50e702011-10-18 21:21:00 +02007716 if (code_page < 0) {
7717 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7718 return NULL;
7719 }
7720
Martin v. Löwis3d325192011-11-04 18:23:06 +01007721 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007722 return PyBytes_FromStringAndSize(NULL, 0);
7723
Victor Stinner7581cef2011-11-03 22:32:33 +01007724 offset = 0;
7725 do
7726 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007727#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007728 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007729 chunks. */
7730 if (len > INT_MAX/2) {
7731 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007732 done = 0;
7733 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007734 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007735#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007736 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007737 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007738 done = 1;
7739 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007740
Victor Stinner76a31a62011-11-04 00:05:13 +01007741 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007742 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007743 errors);
7744 if (ret == -2)
7745 ret = encode_code_page_errors(code_page, &outbytes,
7746 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007747 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007748 if (ret < 0) {
7749 Py_XDECREF(outbytes);
7750 return NULL;
7751 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007752
Victor Stinner7581cef2011-11-03 22:32:33 +01007753 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007754 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007755 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007756
Victor Stinner3a50e702011-10-18 21:21:00 +02007757 return outbytes;
7758}
7759
7760PyObject *
7761PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7762 Py_ssize_t size,
7763 const char *errors)
7764{
Victor Stinner7581cef2011-11-03 22:32:33 +01007765 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007766 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007767 if (unicode == NULL)
7768 return NULL;
7769 res = encode_code_page(CP_ACP, unicode, errors);
7770 Py_DECREF(unicode);
7771 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007772}
7773
7774PyObject *
7775PyUnicode_EncodeCodePage(int code_page,
7776 PyObject *unicode,
7777 const char *errors)
7778{
Victor Stinner7581cef2011-11-03 22:32:33 +01007779 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007780}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007781
Alexander Belopolsky40018472011-02-26 01:02:56 +00007782PyObject *
7783PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007784{
Victor Stinner7581cef2011-11-03 22:32:33 +01007785 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007786}
7787
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007788#undef NEED_RETRY
7789
Steve Dowercc16be82016-09-08 10:35:16 -07007790#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007791
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792/* --- Character Mapping Codec -------------------------------------------- */
7793
Victor Stinnerfb161b12013-04-18 01:44:27 +02007794static int
7795charmap_decode_string(const char *s,
7796 Py_ssize_t size,
7797 PyObject *mapping,
7798 const char *errors,
7799 _PyUnicodeWriter *writer)
7800{
7801 const char *starts = s;
7802 const char *e;
7803 Py_ssize_t startinpos, endinpos;
7804 PyObject *errorHandler = NULL, *exc = NULL;
7805 Py_ssize_t maplen;
7806 enum PyUnicode_Kind mapkind;
7807 void *mapdata;
7808 Py_UCS4 x;
7809 unsigned char ch;
7810
7811 if (PyUnicode_READY(mapping) == -1)
7812 return -1;
7813
7814 maplen = PyUnicode_GET_LENGTH(mapping);
7815 mapdata = PyUnicode_DATA(mapping);
7816 mapkind = PyUnicode_KIND(mapping);
7817
7818 e = s + size;
7819
7820 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7821 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7822 * is disabled in encoding aliases, latin1 is preferred because
7823 * its implementation is faster. */
7824 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7825 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7826 Py_UCS4 maxchar = writer->maxchar;
7827
7828 assert (writer->kind == PyUnicode_1BYTE_KIND);
7829 while (s < e) {
7830 ch = *s;
7831 x = mapdata_ucs1[ch];
7832 if (x > maxchar) {
7833 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7834 goto onError;
7835 maxchar = writer->maxchar;
7836 outdata = (Py_UCS1 *)writer->data;
7837 }
7838 outdata[writer->pos] = x;
7839 writer->pos++;
7840 ++s;
7841 }
7842 return 0;
7843 }
7844
7845 while (s < e) {
7846 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7847 enum PyUnicode_Kind outkind = writer->kind;
7848 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7849 if (outkind == PyUnicode_1BYTE_KIND) {
7850 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7851 Py_UCS4 maxchar = writer->maxchar;
7852 while (s < e) {
7853 ch = *s;
7854 x = mapdata_ucs2[ch];
7855 if (x > maxchar)
7856 goto Error;
7857 outdata[writer->pos] = x;
7858 writer->pos++;
7859 ++s;
7860 }
7861 break;
7862 }
7863 else if (outkind == PyUnicode_2BYTE_KIND) {
7864 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7865 while (s < e) {
7866 ch = *s;
7867 x = mapdata_ucs2[ch];
7868 if (x == 0xFFFE)
7869 goto Error;
7870 outdata[writer->pos] = x;
7871 writer->pos++;
7872 ++s;
7873 }
7874 break;
7875 }
7876 }
7877 ch = *s;
7878
7879 if (ch < maplen)
7880 x = PyUnicode_READ(mapkind, mapdata, ch);
7881 else
7882 x = 0xfffe; /* invalid value */
7883Error:
7884 if (x == 0xfffe)
7885 {
7886 /* undefined mapping */
7887 startinpos = s-starts;
7888 endinpos = startinpos+1;
7889 if (unicode_decode_call_errorhandler_writer(
7890 errors, &errorHandler,
7891 "charmap", "character maps to <undefined>",
7892 &starts, &e, &startinpos, &endinpos, &exc, &s,
7893 writer)) {
7894 goto onError;
7895 }
7896 continue;
7897 }
7898
7899 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7900 goto onError;
7901 ++s;
7902 }
7903 Py_XDECREF(errorHandler);
7904 Py_XDECREF(exc);
7905 return 0;
7906
7907onError:
7908 Py_XDECREF(errorHandler);
7909 Py_XDECREF(exc);
7910 return -1;
7911}
7912
7913static int
7914charmap_decode_mapping(const char *s,
7915 Py_ssize_t size,
7916 PyObject *mapping,
7917 const char *errors,
7918 _PyUnicodeWriter *writer)
7919{
7920 const char *starts = s;
7921 const char *e;
7922 Py_ssize_t startinpos, endinpos;
7923 PyObject *errorHandler = NULL, *exc = NULL;
7924 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007925 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007926
7927 e = s + size;
7928
7929 while (s < e) {
7930 ch = *s;
7931
7932 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7933 key = PyLong_FromLong((long)ch);
7934 if (key == NULL)
7935 goto onError;
7936
7937 item = PyObject_GetItem(mapping, key);
7938 Py_DECREF(key);
7939 if (item == NULL) {
7940 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7941 /* No mapping found means: mapping is undefined. */
7942 PyErr_Clear();
7943 goto Undefined;
7944 } else
7945 goto onError;
7946 }
7947
7948 /* Apply mapping */
7949 if (item == Py_None)
7950 goto Undefined;
7951 if (PyLong_Check(item)) {
7952 long value = PyLong_AS_LONG(item);
7953 if (value == 0xFFFE)
7954 goto Undefined;
7955 if (value < 0 || value > MAX_UNICODE) {
7956 PyErr_Format(PyExc_TypeError,
7957 "character mapping must be in range(0x%lx)",
7958 (unsigned long)MAX_UNICODE + 1);
7959 goto onError;
7960 }
7961
7962 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7963 goto onError;
7964 }
7965 else if (PyUnicode_Check(item)) {
7966 if (PyUnicode_READY(item) == -1)
7967 goto onError;
7968 if (PyUnicode_GET_LENGTH(item) == 1) {
7969 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7970 if (value == 0xFFFE)
7971 goto Undefined;
7972 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7973 goto onError;
7974 }
7975 else {
7976 writer->overallocate = 1;
7977 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7978 goto onError;
7979 }
7980 }
7981 else {
7982 /* wrong return value */
7983 PyErr_SetString(PyExc_TypeError,
7984 "character mapping must return integer, None or str");
7985 goto onError;
7986 }
7987 Py_CLEAR(item);
7988 ++s;
7989 continue;
7990
7991Undefined:
7992 /* undefined mapping */
7993 Py_CLEAR(item);
7994 startinpos = s-starts;
7995 endinpos = startinpos+1;
7996 if (unicode_decode_call_errorhandler_writer(
7997 errors, &errorHandler,
7998 "charmap", "character maps to <undefined>",
7999 &starts, &e, &startinpos, &endinpos, &exc, &s,
8000 writer)) {
8001 goto onError;
8002 }
8003 }
8004 Py_XDECREF(errorHandler);
8005 Py_XDECREF(exc);
8006 return 0;
8007
8008onError:
8009 Py_XDECREF(item);
8010 Py_XDECREF(errorHandler);
8011 Py_XDECREF(exc);
8012 return -1;
8013}
8014
Alexander Belopolsky40018472011-02-26 01:02:56 +00008015PyObject *
8016PyUnicode_DecodeCharmap(const char *s,
8017 Py_ssize_t size,
8018 PyObject *mapping,
8019 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008021 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008022
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 /* Default to Latin-1 */
8024 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008028 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008029 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008030 writer.min_length = size;
8031 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008033
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008034 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008035 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8036 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008037 }
8038 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008039 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8040 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008042 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008043
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008045 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046 return NULL;
8047}
8048
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008049/* Charmap encoding: the lookup table */
8050
Alexander Belopolsky40018472011-02-26 01:02:56 +00008051struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 PyObject_HEAD
8053 unsigned char level1[32];
8054 int count2, count3;
8055 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008056};
8057
8058static PyObject*
8059encoding_map_size(PyObject *obj, PyObject* args)
8060{
8061 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008062 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008064}
8065
8066static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008067 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 PyDoc_STR("Return the size (in bytes) of this object") },
8069 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008070};
8071
8072static void
8073encoding_map_dealloc(PyObject* o)
8074{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008075 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008076}
8077
8078static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008079 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008080 "EncodingMap", /*tp_name*/
8081 sizeof(struct encoding_map), /*tp_basicsize*/
8082 0, /*tp_itemsize*/
8083 /* methods */
8084 encoding_map_dealloc, /*tp_dealloc*/
8085 0, /*tp_print*/
8086 0, /*tp_getattr*/
8087 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008088 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 0, /*tp_repr*/
8090 0, /*tp_as_number*/
8091 0, /*tp_as_sequence*/
8092 0, /*tp_as_mapping*/
8093 0, /*tp_hash*/
8094 0, /*tp_call*/
8095 0, /*tp_str*/
8096 0, /*tp_getattro*/
8097 0, /*tp_setattro*/
8098 0, /*tp_as_buffer*/
8099 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8100 0, /*tp_doc*/
8101 0, /*tp_traverse*/
8102 0, /*tp_clear*/
8103 0, /*tp_richcompare*/
8104 0, /*tp_weaklistoffset*/
8105 0, /*tp_iter*/
8106 0, /*tp_iternext*/
8107 encoding_map_methods, /*tp_methods*/
8108 0, /*tp_members*/
8109 0, /*tp_getset*/
8110 0, /*tp_base*/
8111 0, /*tp_dict*/
8112 0, /*tp_descr_get*/
8113 0, /*tp_descr_set*/
8114 0, /*tp_dictoffset*/
8115 0, /*tp_init*/
8116 0, /*tp_alloc*/
8117 0, /*tp_new*/
8118 0, /*tp_free*/
8119 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120};
8121
8122PyObject*
8123PyUnicode_BuildEncodingMap(PyObject* string)
8124{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008125 PyObject *result;
8126 struct encoding_map *mresult;
8127 int i;
8128 int need_dict = 0;
8129 unsigned char level1[32];
8130 unsigned char level2[512];
8131 unsigned char *mlevel1, *mlevel2, *mlevel3;
8132 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008133 int kind;
8134 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008135 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008136 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008137
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008138 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008139 PyErr_BadArgument();
8140 return NULL;
8141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008142 kind = PyUnicode_KIND(string);
8143 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008144 length = PyUnicode_GET_LENGTH(string);
8145 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008146 memset(level1, 0xFF, sizeof level1);
8147 memset(level2, 0xFF, sizeof level2);
8148
8149 /* If there isn't a one-to-one mapping of NULL to \0,
8150 or if there are non-BMP characters, we need to use
8151 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008152 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008153 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008154 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008155 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 ch = PyUnicode_READ(kind, data, i);
8157 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008158 need_dict = 1;
8159 break;
8160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008162 /* unmapped character */
8163 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008164 l1 = ch >> 11;
8165 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008166 if (level1[l1] == 0xFF)
8167 level1[l1] = count2++;
8168 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170 }
8171
8172 if (count2 >= 0xFF || count3 >= 0xFF)
8173 need_dict = 1;
8174
8175 if (need_dict) {
8176 PyObject *result = PyDict_New();
8177 PyObject *key, *value;
8178 if (!result)
8179 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008180 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008181 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008182 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008183 if (!key || !value)
8184 goto failed1;
8185 if (PyDict_SetItem(result, key, value) == -1)
8186 goto failed1;
8187 Py_DECREF(key);
8188 Py_DECREF(value);
8189 }
8190 return result;
8191 failed1:
8192 Py_XDECREF(key);
8193 Py_XDECREF(value);
8194 Py_DECREF(result);
8195 return NULL;
8196 }
8197
8198 /* Create a three-level trie */
8199 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8200 16*count2 + 128*count3 - 1);
8201 if (!result)
8202 return PyErr_NoMemory();
8203 PyObject_Init(result, &EncodingMapType);
8204 mresult = (struct encoding_map*)result;
8205 mresult->count2 = count2;
8206 mresult->count3 = count3;
8207 mlevel1 = mresult->level1;
8208 mlevel2 = mresult->level23;
8209 mlevel3 = mresult->level23 + 16*count2;
8210 memcpy(mlevel1, level1, 32);
8211 memset(mlevel2, 0xFF, 16*count2);
8212 memset(mlevel3, 0, 128*count3);
8213 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008214 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008215 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008216 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8217 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008218 /* unmapped character */
8219 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008220 o1 = ch>>11;
8221 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008222 i2 = 16*mlevel1[o1] + o2;
8223 if (mlevel2[i2] == 0xFF)
8224 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008225 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008226 i3 = 128*mlevel2[i2] + o3;
8227 mlevel3[i3] = i;
8228 }
8229 return result;
8230}
8231
8232static int
Victor Stinner22168992011-11-20 17:09:18 +01008233encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008234{
8235 struct encoding_map *map = (struct encoding_map*)mapping;
8236 int l1 = c>>11;
8237 int l2 = (c>>7) & 0xF;
8238 int l3 = c & 0x7F;
8239 int i;
8240
Victor Stinner22168992011-11-20 17:09:18 +01008241 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008243 if (c == 0)
8244 return 0;
8245 /* level 1*/
8246 i = map->level1[l1];
8247 if (i == 0xFF) {
8248 return -1;
8249 }
8250 /* level 2*/
8251 i = map->level23[16*i+l2];
8252 if (i == 0xFF) {
8253 return -1;
8254 }
8255 /* level 3 */
8256 i = map->level23[16*map->count2 + 128*i + l3];
8257 if (i == 0) {
8258 return -1;
8259 }
8260 return i;
8261}
8262
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263/* Lookup the character ch in the mapping. If the character
8264 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008265 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008266static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008267charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268{
Christian Heimes217cfd12007-12-02 14:31:20 +00008269 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270 PyObject *x;
8271
8272 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274 x = PyObject_GetItem(mapping, w);
8275 Py_DECREF(w);
8276 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8278 /* No mapping found means: mapping is undefined. */
8279 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008280 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 } else
8282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008284 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008286 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 long value = PyLong_AS_LONG(x);
8288 if (value < 0 || value > 255) {
8289 PyErr_SetString(PyExc_TypeError,
8290 "character mapping must be in range(256)");
8291 Py_DECREF(x);
8292 return NULL;
8293 }
8294 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008296 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 /* wrong return value */
8300 PyErr_Format(PyExc_TypeError,
8301 "character mapping must return integer, bytes or None, not %.400s",
8302 x->ob_type->tp_name);
8303 Py_DECREF(x);
8304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 }
8306}
8307
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008308static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008309charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008310{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008311 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8312 /* exponentially overallocate to minimize reallocations */
8313 if (requiredsize < 2*outsize)
8314 requiredsize = 2*outsize;
8315 if (_PyBytes_Resize(outobj, requiredsize))
8316 return -1;
8317 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008318}
8319
/* Outcome of writing one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an error occurred (e.g. lookup or resize failed) */
} charmapencode_result;
/* Look up the character, put the result in the output bytes object and
   adjust various state variables. Resize the output bytes object if not
   enough space is available. Return enc_SUCCESS if the character was
   written, enc_FAILED if the mapping was undefined (in which case no
   character was written) or enc_EXCEPTION if an error occurred, for
   example a failed reallocation. No object reference is returned, so the
   caller has nothing to decref. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008329static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008330charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008331 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008332{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 PyObject *rep;
8334 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008335 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336
Christian Heimes90aa7642007-12-19 02:45:37 +00008337 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008338 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008340 if (res == -1)
8341 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 if (outsize<requiredsize)
8343 if (charmapencode_resize(outobj, outpos, requiredsize))
8344 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008345 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 outstart[(*outpos)++] = (char)res;
8347 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008348 }
8349
8350 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008351 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008353 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 Py_DECREF(rep);
8355 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008356 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 if (PyLong_Check(rep)) {
8358 Py_ssize_t requiredsize = *outpos+1;
8359 if (outsize<requiredsize)
8360 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8361 Py_DECREF(rep);
8362 return enc_EXCEPTION;
8363 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008364 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008366 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 else {
8368 const char *repchars = PyBytes_AS_STRING(rep);
8369 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8370 Py_ssize_t requiredsize = *outpos+repsize;
8371 if (outsize<requiredsize)
8372 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8373 Py_DECREF(rep);
8374 return enc_EXCEPTION;
8375 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008376 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 memcpy(outstart + *outpos, repchars, repsize);
8378 *outpos += repsize;
8379 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008381 Py_DECREF(rep);
8382 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383}
8384
8385/* handle an error in PyUnicode_EncodeCharmap
8386 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008387static int
8388charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008389 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008391 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008392 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393{
8394 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008395 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008396 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008397 enum PyUnicode_Kind kind;
8398 void *data;
8399 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008401 Py_ssize_t collstartpos = *inpos;
8402 Py_ssize_t collendpos = *inpos+1;
8403 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008404 const char *encoding = "charmap";
8405 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008406 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008407 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008408 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008409
Benjamin Petersonbac79492012-01-14 13:34:47 -05008410 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008411 return -1;
8412 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 /* find all unencodable characters */
8414 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008415 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008416 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008417 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008418 val = encoding_map_lookup(ch, mapping);
8419 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 break;
8421 ++collendpos;
8422 continue;
8423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008425 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8426 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 if (rep==NULL)
8428 return -1;
8429 else if (rep!=Py_None) {
8430 Py_DECREF(rep);
8431 break;
8432 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008433 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435 }
8436 /* cache callback name lookup
8437 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008438 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008439 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008440
8441 switch (*error_handler) {
8442 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008443 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008444 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008445
8446 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008447 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 x = charmapencode_output('?', mapping, res, respos);
8449 if (x==enc_EXCEPTION) {
8450 return -1;
8451 }
8452 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008453 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 return -1;
8455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008456 }
8457 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008458 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008459 *inpos = collendpos;
8460 break;
Victor Stinner50149202015-09-22 00:26:54 +02008461
8462 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 /* generate replacement (temporarily (mis)uses p) */
8464 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 char buffer[2+29+1+1];
8466 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008467 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 for (cp = buffer; *cp; ++cp) {
8469 x = charmapencode_output(*cp, mapping, res, respos);
8470 if (x==enc_EXCEPTION)
8471 return -1;
8472 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008473 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 return -1;
8475 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008476 }
8477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008478 *inpos = collendpos;
8479 break;
Victor Stinner50149202015-09-22 00:26:54 +02008480
Benjamin Peterson14339b62009-01-31 16:36:08 +00008481 default:
Victor Stinner50149202015-09-22 00:26:54 +02008482 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008483 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008485 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008487 if (PyBytes_Check(repunicode)) {
8488 /* Directly copy bytes result to output. */
8489 Py_ssize_t outsize = PyBytes_Size(*res);
8490 Py_ssize_t requiredsize;
8491 repsize = PyBytes_Size(repunicode);
8492 requiredsize = *respos + repsize;
8493 if (requiredsize > outsize)
8494 /* Make room for all additional bytes. */
8495 if (charmapencode_resize(res, respos, requiredsize)) {
8496 Py_DECREF(repunicode);
8497 return -1;
8498 }
8499 memcpy(PyBytes_AsString(*res) + *respos,
8500 PyBytes_AsString(repunicode), repsize);
8501 *respos += repsize;
8502 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008503 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008504 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008505 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008506 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008507 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008508 Py_DECREF(repunicode);
8509 return -1;
8510 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008511 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008512 data = PyUnicode_DATA(repunicode);
8513 kind = PyUnicode_KIND(repunicode);
8514 for (index = 0; index < repsize; index++) {
8515 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8516 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008518 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 return -1;
8520 }
8521 else if (x==enc_FAILED) {
8522 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008523 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 return -1;
8525 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008526 }
8527 *inpos = newpos;
8528 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008529 }
8530 return 0;
8531}
8532
Alexander Belopolsky40018472011-02-26 01:02:56 +00008533PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008534_PyUnicode_EncodeCharmap(PyObject *unicode,
8535 PyObject *mapping,
8536 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 /* output object */
8539 PyObject *res = NULL;
8540 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008541 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008542 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008544 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008545 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008547 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008548 void *data;
8549 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550
Benjamin Petersonbac79492012-01-14 13:34:47 -05008551 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008552 return NULL;
8553 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008554 data = PyUnicode_DATA(unicode);
8555 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008556
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557 /* Default to Latin-1 */
8558 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008559 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 /* allocate enough for a simple encoding without
8562 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008563 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 if (res == NULL)
8565 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008566 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008569 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008570 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008572 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 if (x==enc_EXCEPTION) /* error */
8574 goto onError;
8575 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008576 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008578 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 &res, &respos)) {
8580 goto onError;
8581 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008582 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 else
8584 /* done with this character => adjust input position */
8585 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008589 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008590 if (_PyBytes_Resize(&res, respos) < 0)
8591 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008593 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008594 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595 return res;
8596
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598 Py_XDECREF(res);
8599 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008600 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601 return NULL;
8602}
8603
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008604/* Deprecated */
8605PyObject *
8606PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8607 Py_ssize_t size,
8608 PyObject *mapping,
8609 const char *errors)
8610{
8611 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008612 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008613 if (unicode == NULL)
8614 return NULL;
8615 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8616 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008617 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008618}
8619
Alexander Belopolsky40018472011-02-26 01:02:56 +00008620PyObject *
8621PyUnicode_AsCharmapString(PyObject *unicode,
8622 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623{
8624 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 PyErr_BadArgument();
8626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008628 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629}
8630
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008632static void
8633make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008635 Py_ssize_t startpos, Py_ssize_t endpos,
8636 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 *exceptionObject = _PyUnicodeTranslateError_Create(
8640 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641 }
8642 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8644 goto onError;
8645 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8646 goto onError;
8647 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8648 goto onError;
8649 return;
8650 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008651 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652 }
8653}
8654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008655/* error handling callback helper:
8656 build arguments, call the callback and check the arguments,
8657 put the result into newpos and return the replacement string, which
8658 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008659static PyObject *
8660unicode_translate_call_errorhandler(const char *errors,
8661 PyObject **errorHandler,
8662 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008664 Py_ssize_t startpos, Py_ssize_t endpos,
8665 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008667 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008669 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 PyObject *restuple;
8671 PyObject *resunicode;
8672
8673 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677 }
8678
8679 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008684 restuple = PyObject_CallFunctionObjArgs(
8685 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008689 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 Py_DECREF(restuple);
8691 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008693 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 &resunicode, &i_newpos)) {
8695 Py_DECREF(restuple);
8696 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008698 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008700 else
8701 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008702 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008703 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 Py_DECREF(restuple);
8705 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008706 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707 Py_INCREF(resunicode);
8708 Py_DECREF(restuple);
8709 return resunicode;
8710}
8711
8712/* Lookup the character ch in the mapping and put the result in result,
8713 which must be decrefed by the caller.
8714 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008715static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717{
Christian Heimes217cfd12007-12-02 14:31:20 +00008718 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719 PyObject *x;
8720
8721 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008723 x = PyObject_GetItem(mapping, w);
8724 Py_DECREF(w);
8725 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8727 /* No mapping found means: use 1:1 mapping. */
8728 PyErr_Clear();
8729 *result = NULL;
8730 return 0;
8731 } else
8732 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733 }
8734 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 *result = x;
8736 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008738 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008740 if (value < 0 || value > MAX_UNICODE) {
8741 PyErr_Format(PyExc_ValueError,
8742 "character mapping must be in range(0x%x)",
8743 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 Py_DECREF(x);
8745 return -1;
8746 }
8747 *result = x;
8748 return 0;
8749 }
8750 else if (PyUnicode_Check(x)) {
8751 *result = x;
8752 return 0;
8753 }
8754 else {
8755 /* wrong return value */
8756 PyErr_SetString(PyExc_TypeError,
8757 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008758 Py_DECREF(x);
8759 return -1;
8760 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008761}
Victor Stinner1194ea02014-04-04 19:37:40 +02008762
8763/* lookup the character, write the result into the writer.
8764 Return 1 if the result was written into the writer, return 0 if the mapping
8765 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008766static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008767charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8768 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008769{
Victor Stinner1194ea02014-04-04 19:37:40 +02008770 PyObject *item;
8771
8772 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008774
8775 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008777 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008780 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008782
8783 if (item == Py_None) {
8784 Py_DECREF(item);
8785 return 0;
8786 }
8787
8788 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008789 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8790 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8791 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008792 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8793 Py_DECREF(item);
8794 return -1;
8795 }
8796 Py_DECREF(item);
8797 return 1;
8798 }
8799
8800 if (!PyUnicode_Check(item)) {
8801 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008803 }
8804
8805 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8806 Py_DECREF(item);
8807 return -1;
8808 }
8809
8810 Py_DECREF(item);
8811 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008812}
8813
Victor Stinner89a76ab2014-04-05 11:44:04 +02008814static int
8815unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8816 Py_UCS1 *translate)
8817{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008818 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008819 int ret = 0;
8820
Victor Stinner89a76ab2014-04-05 11:44:04 +02008821 if (charmaptranslate_lookup(ch, mapping, &item)) {
8822 return -1;
8823 }
8824
8825 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008826 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008827 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008828 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008829 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008830 /* not found => default to 1:1 mapping */
8831 translate[ch] = ch;
8832 return 1;
8833 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008834 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008835 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008836 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8837 used it */
8838 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008839 /* invalid character or character outside ASCII:
8840 skip the fast translate */
8841 goto exit;
8842 }
8843 translate[ch] = (Py_UCS1)replace;
8844 }
8845 else if (PyUnicode_Check(item)) {
8846 Py_UCS4 replace;
8847
8848 if (PyUnicode_READY(item) == -1) {
8849 Py_DECREF(item);
8850 return -1;
8851 }
8852 if (PyUnicode_GET_LENGTH(item) != 1)
8853 goto exit;
8854
8855 replace = PyUnicode_READ_CHAR(item, 0);
8856 if (replace > 127)
8857 goto exit;
8858 translate[ch] = (Py_UCS1)replace;
8859 }
8860 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008861 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008862 goto exit;
8863 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008864 ret = 1;
8865
Benjamin Peterson1365de72014-04-07 20:15:41 -04008866 exit:
8867 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008868 return ret;
8869}
8870
8871/* Fast path for ascii => ascii translation. Return 1 if the whole string
8872 was translated into writer, return 0 if the input string was partially
8873 translated into writer, raise an exception and return -1 on error. */
8874static int
8875unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008876 _PyUnicodeWriter *writer, int ignore,
8877 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008878{
Victor Stinner872b2912014-04-05 14:27:07 +02008879 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008880 Py_ssize_t len;
8881 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008882 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008883
Victor Stinner89a76ab2014-04-05 11:44:04 +02008884 len = PyUnicode_GET_LENGTH(input);
8885
Victor Stinner872b2912014-04-05 14:27:07 +02008886 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008887
8888 in = PyUnicode_1BYTE_DATA(input);
8889 end = in + len;
8890
8891 assert(PyUnicode_IS_ASCII(writer->buffer));
8892 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8893 out = PyUnicode_1BYTE_DATA(writer->buffer);
8894
Victor Stinner872b2912014-04-05 14:27:07 +02008895 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008896 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008897 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008898 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008899 int translate = unicode_fast_translate_lookup(mapping, ch,
8900 ascii_table);
8901 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008902 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008903 if (translate == 0)
8904 goto exit;
8905 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008906 }
Victor Stinner872b2912014-04-05 14:27:07 +02008907 if (ch2 == 0xfe) {
8908 if (ignore)
8909 continue;
8910 goto exit;
8911 }
8912 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008913 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008914 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008915 }
Victor Stinner872b2912014-04-05 14:27:07 +02008916 res = 1;
8917
8918exit:
8919 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008920 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008921 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008922}
8923
Victor Stinner3222da22015-10-01 22:07:32 +02008924static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925_PyUnicode_TranslateCharmap(PyObject *input,
8926 PyObject *mapping,
8927 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008930 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 Py_ssize_t size, i;
8932 int kind;
8933 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008934 _PyUnicodeWriter writer;
8935 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008936 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008937 PyObject *errorHandler = NULL;
8938 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008939 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008940 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008941
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 PyErr_BadArgument();
8944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 if (PyUnicode_READY(input) == -1)
8948 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008949 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 kind = PyUnicode_KIND(input);
8951 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008953 if (size == 0)
8954 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008956 /* allocate enough for a simple 1:1 translation without
8957 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008958 _PyUnicodeWriter_Init(&writer);
8959 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961
Victor Stinner872b2912014-04-05 14:27:07 +02008962 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8963
Victor Stinner33798672016-03-01 21:59:58 +01008964 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008965 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008966 if (PyUnicode_IS_ASCII(input)) {
8967 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8968 if (res < 0) {
8969 _PyUnicodeWriter_Dealloc(&writer);
8970 return NULL;
8971 }
8972 if (res == 1)
8973 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008974 }
Victor Stinner33798672016-03-01 21:59:58 +01008975 else {
8976 i = 0;
8977 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008981 int translate;
8982 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8983 Py_ssize_t newpos;
8984 /* startpos for collecting untranslatable chars */
8985 Py_ssize_t collstart;
8986 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008987 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988
Victor Stinner1194ea02014-04-04 19:37:40 +02008989 ch = PyUnicode_READ(kind, data, i);
8990 translate = charmaptranslate_output(ch, mapping, &writer);
8991 if (translate < 0)
8992 goto onError;
8993
8994 if (translate != 0) {
8995 /* it worked => adjust input pointer */
8996 ++i;
8997 continue;
8998 }
8999
9000 /* untranslatable character */
9001 collstart = i;
9002 collend = i+1;
9003
9004 /* find all untranslatable characters */
9005 while (collend < size) {
9006 PyObject *x;
9007 ch = PyUnicode_READ(kind, data, collend);
9008 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009009 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009010 Py_XDECREF(x);
9011 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009012 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009013 ++collend;
9014 }
9015
9016 if (ignore) {
9017 i = collend;
9018 }
9019 else {
9020 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9021 reason, input, &exc,
9022 collstart, collend, &newpos);
9023 if (repunicode == NULL)
9024 goto onError;
9025 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009026 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009027 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009028 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009029 Py_DECREF(repunicode);
9030 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009031 }
9032 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009033 Py_XDECREF(exc);
9034 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009035 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036
Benjamin Peterson29060642009-01-31 22:14:21 +00009037 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009038 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009039 Py_XDECREF(exc);
9040 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041 return NULL;
9042}
9043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044/* Deprecated. Use PyUnicode_Translate instead. */
9045PyObject *
9046PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9047 Py_ssize_t size,
9048 PyObject *mapping,
9049 const char *errors)
9050{
Christian Heimes5f520f42012-09-11 14:03:25 +02009051 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009052 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 if (!unicode)
9054 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009055 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9056 Py_DECREF(unicode);
9057 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058}
9059
Alexander Belopolsky40018472011-02-26 01:02:56 +00009060PyObject *
9061PyUnicode_Translate(PyObject *str,
9062 PyObject *mapping,
9063 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009064{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009065 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009066 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009067 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068}
Tim Petersced69f82003-09-16 20:30:58 +00009069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070PyObject *
9071_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9072{
9073 if (!PyUnicode_Check(unicode)) {
9074 PyErr_BadInternalCall();
9075 return NULL;
9076 }
9077 if (PyUnicode_READY(unicode) == -1)
9078 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009079 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 /* If the string is already ASCII, just return the same string */
9081 Py_INCREF(unicode);
9082 return unicode;
9083 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009084
9085 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9086 PyObject *result = PyUnicode_New(len, 127);
9087 if (result == NULL) {
9088 return NULL;
9089 }
9090
9091 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9092 int kind = PyUnicode_KIND(unicode);
9093 const void *data = PyUnicode_DATA(unicode);
9094 Py_ssize_t i;
9095 for (i = 0; i < len; ++i) {
9096 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9097 if (ch < 127) {
9098 out[i] = ch;
9099 }
9100 else if (Py_UNICODE_ISSPACE(ch)) {
9101 out[i] = ' ';
9102 }
9103 else {
9104 int decimal = Py_UNICODE_TODECIMAL(ch);
9105 if (decimal < 0) {
9106 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009107 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009108 _PyUnicode_LENGTH(result) = i + 1;
9109 break;
9110 }
9111 out[i] = '0' + decimal;
9112 }
9113 }
9114
INADA Naoki16dfca42018-07-14 12:06:43 +09009115 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009116 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117}
9118
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009119PyObject *
9120PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9121 Py_ssize_t length)
9122{
Victor Stinnerf0124502011-11-21 23:12:56 +01009123 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009124 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009125 Py_UCS4 maxchar;
9126 enum PyUnicode_Kind kind;
9127 void *data;
9128
Victor Stinner99d7ad02012-02-22 13:37:39 +01009129 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009130 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009131 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009132 if (ch > 127) {
9133 int decimal = Py_UNICODE_TODECIMAL(ch);
9134 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009135 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009136 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009137 }
9138 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009139
9140 /* Copy to a new string */
9141 decimal = PyUnicode_New(length, maxchar);
9142 if (decimal == NULL)
9143 return decimal;
9144 kind = PyUnicode_KIND(decimal);
9145 data = PyUnicode_DATA(decimal);
9146 /* Iterate over code points */
9147 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009148 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009149 if (ch > 127) {
9150 int decimal = Py_UNICODE_TODECIMAL(ch);
9151 if (decimal >= 0)
9152 ch = '0' + decimal;
9153 }
9154 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009156 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009157}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009158/* --- Decimal Encoder ---------------------------------------------------- */
9159
Alexander Belopolsky40018472011-02-26 01:02:56 +00009160int
9161PyUnicode_EncodeDecimal(Py_UNICODE *s,
9162 Py_ssize_t length,
9163 char *output,
9164 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009165{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009166 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009167 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009168 enum PyUnicode_Kind kind;
9169 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009170
9171 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 PyErr_BadArgument();
9173 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009174 }
9175
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009176 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009177 if (unicode == NULL)
9178 return -1;
9179
Victor Stinner42bf7752011-11-21 22:52:58 +01009180 kind = PyUnicode_KIND(unicode);
9181 data = PyUnicode_DATA(unicode);
9182
Victor Stinnerb84d7232011-11-22 01:50:07 +01009183 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009184 PyObject *exc;
9185 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009187 Py_ssize_t startpos;
9188
9189 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009190
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009192 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009193 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009195 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 decimal = Py_UNICODE_TODECIMAL(ch);
9197 if (decimal >= 0) {
9198 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009199 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009200 continue;
9201 }
9202 if (0 < ch && ch < 256) {
9203 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009204 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009205 continue;
9206 }
Victor Stinner6345be92011-11-25 20:09:01 +01009207
Victor Stinner42bf7752011-11-21 22:52:58 +01009208 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009209 exc = NULL;
9210 raise_encode_exception(&exc, "decimal", unicode,
9211 startpos, startpos+1,
9212 "invalid decimal Unicode string");
9213 Py_XDECREF(exc);
9214 Py_DECREF(unicode);
9215 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009216 }
9217 /* 0-terminate the output string */
9218 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009219 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009220 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009221}
9222
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223/* --- Helpers ------------------------------------------------------------ */
9224
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; `len` may be any
   expression and is evaluated more than once.  After the macro:
     - end is clamped to len from above; a negative end is taken relative
       to len and clamped to 0,
     - a negative start is taken relative to len and clamped to 0
       (start is NOT clamped to len from above).

   Wrapped in do { } while (0) so that `ADJUST_INDICES(...);` is a single
   statement and stays correct inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009241any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009243 Py_ssize_t end,
9244 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009246 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 void *buf1, *buf2;
9248 Py_ssize_t len1, len2, result;
9249
9250 kind1 = PyUnicode_KIND(s1);
9251 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009252 if (kind1 < kind2)
9253 return -1;
9254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255 len1 = PyUnicode_GET_LENGTH(s1);
9256 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009257 ADJUST_INDICES(start, end, len1);
9258 if (end - start < len2)
9259 return -1;
9260
9261 buf1 = PyUnicode_DATA(s1);
9262 buf2 = PyUnicode_DATA(s2);
9263 if (len2 == 1) {
9264 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9265 result = findchar((const char *)buf1 + kind1*start,
9266 kind1, end - start, ch, direction);
9267 if (result == -1)
9268 return -1;
9269 else
9270 return start + result;
9271 }
9272
9273 if (kind2 != kind1) {
9274 buf2 = _PyUnicode_AsKind(s2, kind1);
9275 if (!buf2)
9276 return -2;
9277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278
Victor Stinner794d5672011-10-10 03:21:36 +02009279 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009280 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009281 case PyUnicode_1BYTE_KIND:
9282 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9283 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9284 else
9285 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9286 break;
9287 case PyUnicode_2BYTE_KIND:
9288 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9289 break;
9290 case PyUnicode_4BYTE_KIND:
9291 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9292 break;
9293 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009294 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009295 }
9296 }
9297 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009298 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009299 case PyUnicode_1BYTE_KIND:
9300 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9301 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9302 else
9303 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9304 break;
9305 case PyUnicode_2BYTE_KIND:
9306 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9307 break;
9308 case PyUnicode_4BYTE_KIND:
9309 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9310 break;
9311 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009312 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 }
9315
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009316 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 PyMem_Free(buf2);
9318
9319 return result;
9320}
9321
9322Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009323_PyUnicode_InsertThousandsGrouping(
9324 PyObject *unicode, Py_ssize_t index,
9325 Py_ssize_t n_buffer,
9326 void *digits, Py_ssize_t n_digits,
9327 Py_ssize_t min_width,
9328 const char *grouping, PyObject *thousands_sep,
9329 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330{
Victor Stinner41a863c2012-02-24 00:37:51 +01009331 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009332 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009333 Py_ssize_t thousands_sep_len;
9334 Py_ssize_t len;
9335
9336 if (unicode != NULL) {
9337 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009338 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009339 }
9340 else {
9341 kind = PyUnicode_1BYTE_KIND;
9342 data = NULL;
9343 }
9344 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9345 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9346 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9347 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009348 if (thousands_sep_kind < kind) {
9349 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9350 if (!thousands_sep_data)
9351 return -1;
9352 }
9353 else {
9354 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9355 if (!data)
9356 return -1;
9357 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009358 }
9359
Benjamin Petersonead6b532011-12-20 17:23:42 -06009360 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009362 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009363 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009364 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009365 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009366 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009367 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009368 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009369 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009370 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009371 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009372 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009374 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009375 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009376 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009377 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009378 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009380 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009381 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009382 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009383 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009384 break;
9385 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009386 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009388 if (unicode != NULL && thousands_sep_kind != kind) {
9389 if (thousands_sep_kind < kind)
9390 PyMem_Free(thousands_sep_data);
9391 else
9392 PyMem_Free(data);
9393 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009394 if (unicode == NULL) {
9395 *maxchar = 127;
9396 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009397 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009398 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009399 }
9400 }
9401 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402}
9403
9404
Alexander Belopolsky40018472011-02-26 01:02:56 +00009405Py_ssize_t
9406PyUnicode_Count(PyObject *str,
9407 PyObject *substr,
9408 Py_ssize_t start,
9409 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009411 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009412 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 void *buf1 = NULL, *buf2 = NULL;
9414 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009415
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009416 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009418
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009419 kind1 = PyUnicode_KIND(str);
9420 kind2 = PyUnicode_KIND(substr);
9421 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009422 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009423
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009424 len1 = PyUnicode_GET_LENGTH(str);
9425 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009427 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009428 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009429
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009430 buf1 = PyUnicode_DATA(str);
9431 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009432 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009433 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009434 if (!buf2)
9435 goto onError;
9436 }
9437
9438 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009440 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009441 result = asciilib_count(
9442 ((Py_UCS1*)buf1) + start, end - start,
9443 buf2, len2, PY_SSIZE_T_MAX
9444 );
9445 else
9446 result = ucs1lib_count(
9447 ((Py_UCS1*)buf1) + start, end - start,
9448 buf2, len2, PY_SSIZE_T_MAX
9449 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 break;
9451 case PyUnicode_2BYTE_KIND:
9452 result = ucs2lib_count(
9453 ((Py_UCS2*)buf1) + start, end - start,
9454 buf2, len2, PY_SSIZE_T_MAX
9455 );
9456 break;
9457 case PyUnicode_4BYTE_KIND:
9458 result = ucs4lib_count(
9459 ((Py_UCS4*)buf1) + start, end - start,
9460 buf2, len2, PY_SSIZE_T_MAX
9461 );
9462 break;
9463 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009464 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009466
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009467 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 PyMem_Free(buf2);
9469
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009472 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 PyMem_Free(buf2);
9474 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475}
9476
Alexander Belopolsky40018472011-02-26 01:02:56 +00009477Py_ssize_t
9478PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009479 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009480 Py_ssize_t start,
9481 Py_ssize_t end,
9482 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009484 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009485 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009486
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009487 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488}
9489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490Py_ssize_t
9491PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9492 Py_ssize_t start, Py_ssize_t end,
9493 int direction)
9494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009496 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 if (PyUnicode_READY(str) == -1)
9498 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009499 len = PyUnicode_GET_LENGTH(str);
9500 ADJUST_INDICES(start, end, len);
9501 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009502 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009504 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9505 kind, end-start, ch, direction);
9506 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009508 else
9509 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510}
9511
Alexander Belopolsky40018472011-02-26 01:02:56 +00009512static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009513tailmatch(PyObject *self,
9514 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009515 Py_ssize_t start,
9516 Py_ssize_t end,
9517 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519 int kind_self;
9520 int kind_sub;
9521 void *data_self;
9522 void *data_sub;
9523 Py_ssize_t offset;
9524 Py_ssize_t i;
9525 Py_ssize_t end_sub;
9526
9527 if (PyUnicode_READY(self) == -1 ||
9528 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009529 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9532 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009536 if (PyUnicode_GET_LENGTH(substring) == 0)
9537 return 1;
9538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009539 kind_self = PyUnicode_KIND(self);
9540 data_self = PyUnicode_DATA(self);
9541 kind_sub = PyUnicode_KIND(substring);
9542 data_sub = PyUnicode_DATA(substring);
9543 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9544
9545 if (direction > 0)
9546 offset = end;
9547 else
9548 offset = start;
9549
9550 if (PyUnicode_READ(kind_self, data_self, offset) ==
9551 PyUnicode_READ(kind_sub, data_sub, 0) &&
9552 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9553 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9554 /* If both are of the same kind, memcmp is sufficient */
9555 if (kind_self == kind_sub) {
9556 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009557 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 data_sub,
9559 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009560 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009562 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 else {
9564 /* We do not need to compare 0 and len(substring)-1 because
9565 the if statement above ensured already that they are equal
9566 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 for (i = 1; i < end_sub; ++i) {
9568 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9569 PyUnicode_READ(kind_sub, data_sub, i))
9570 return 0;
9571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574 }
9575
9576 return 0;
9577}
9578
Alexander Belopolsky40018472011-02-26 01:02:56 +00009579Py_ssize_t
9580PyUnicode_Tailmatch(PyObject *str,
9581 PyObject *substr,
9582 Py_ssize_t start,
9583 Py_ssize_t end,
9584 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009586 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009587 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009588
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009589 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590}
9591
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009592static PyObject *
9593ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009595 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9596 char *resdata, *data = PyUnicode_DATA(self);
9597 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009598
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009599 res = PyUnicode_New(len, 127);
9600 if (res == NULL)
9601 return NULL;
9602 resdata = PyUnicode_DATA(res);
9603 if (lower)
9604 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009606 _Py_bytes_upper(resdata, data, len);
9607 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608}
9609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009611handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009613 Py_ssize_t j;
9614 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009615 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009616 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009617
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009618 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9619
9620 where ! is a negation and \p{xxx} is a character with property xxx.
9621 */
9622 for (j = i - 1; j >= 0; j--) {
9623 c = PyUnicode_READ(kind, data, j);
9624 if (!_PyUnicode_IsCaseIgnorable(c))
9625 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009627 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9628 if (final_sigma) {
9629 for (j = i + 1; j < length; j++) {
9630 c = PyUnicode_READ(kind, data, j);
9631 if (!_PyUnicode_IsCaseIgnorable(c))
9632 break;
9633 }
9634 final_sigma = j == length || !_PyUnicode_IsCased(c);
9635 }
9636 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637}
9638
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009639static int
9640lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9641 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009643 /* Obscure special case. */
9644 if (c == 0x3A3) {
9645 mapped[0] = handle_capital_sigma(kind, data, length, i);
9646 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009648 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649}
9650
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009651static Py_ssize_t
9652do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009654 Py_ssize_t i, k = 0;
9655 int n_res, j;
9656 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009657
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 c = PyUnicode_READ(kind, data, 0);
9659 n_res = _PyUnicode_ToUpperFull(c, mapped);
9660 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009661 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009664 for (i = 1; i < length; i++) {
9665 c = PyUnicode_READ(kind, data, i);
9666 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9667 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009668 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009669 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009670 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009671 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009672 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673}
9674
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675static Py_ssize_t
9676do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9677 Py_ssize_t i, k = 0;
9678
9679 for (i = 0; i < length; i++) {
9680 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9681 int n_res, j;
9682 if (Py_UNICODE_ISUPPER(c)) {
9683 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9684 }
9685 else if (Py_UNICODE_ISLOWER(c)) {
9686 n_res = _PyUnicode_ToUpperFull(c, mapped);
9687 }
9688 else {
9689 n_res = 1;
9690 mapped[0] = c;
9691 }
9692 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009693 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694 res[k++] = mapped[j];
9695 }
9696 }
9697 return k;
9698}
9699
9700static Py_ssize_t
9701do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9702 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704 Py_ssize_t i, k = 0;
9705
9706 for (i = 0; i < length; i++) {
9707 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9708 int n_res, j;
9709 if (lower)
9710 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9711 else
9712 n_res = _PyUnicode_ToUpperFull(c, mapped);
9713 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009714 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009715 res[k++] = mapped[j];
9716 }
9717 }
9718 return k;
9719}
9720
9721static Py_ssize_t
9722do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9723{
9724 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9725}
9726
9727static Py_ssize_t
9728do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9729{
9730 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9731}
9732
Benjamin Petersone51757f2012-01-12 21:10:29 -05009733static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009734do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9735{
9736 Py_ssize_t i, k = 0;
9737
9738 for (i = 0; i < length; i++) {
9739 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9740 Py_UCS4 mapped[3];
9741 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9742 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009743 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009744 res[k++] = mapped[j];
9745 }
9746 }
9747 return k;
9748}
9749
9750static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009751do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9752{
9753 Py_ssize_t i, k = 0;
9754 int previous_is_cased;
9755
9756 previous_is_cased = 0;
9757 for (i = 0; i < length; i++) {
9758 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9759 Py_UCS4 mapped[3];
9760 int n_res, j;
9761
9762 if (previous_is_cased)
9763 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9764 else
9765 n_res = _PyUnicode_ToTitleFull(c, mapped);
9766
9767 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009768 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009769 res[k++] = mapped[j];
9770 }
9771
9772 previous_is_cased = _PyUnicode_IsCased(c);
9773 }
9774 return k;
9775}
9776
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009777static PyObject *
9778case_operation(PyObject *self,
9779 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9780{
9781 PyObject *res = NULL;
9782 Py_ssize_t length, newlength = 0;
9783 int kind, outkind;
9784 void *data, *outdata;
9785 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9786
Benjamin Petersoneea48462012-01-16 14:28:50 -05009787 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009788
9789 kind = PyUnicode_KIND(self);
9790 data = PyUnicode_DATA(self);
9791 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009792 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009793 PyErr_SetString(PyExc_OverflowError, "string is too long");
9794 return NULL;
9795 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009796 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009797 if (tmp == NULL)
9798 return PyErr_NoMemory();
9799 newlength = perform(kind, data, length, tmp, &maxchar);
9800 res = PyUnicode_New(newlength, maxchar);
9801 if (res == NULL)
9802 goto leave;
9803 tmpend = tmp + newlength;
9804 outdata = PyUnicode_DATA(res);
9805 outkind = PyUnicode_KIND(res);
9806 switch (outkind) {
9807 case PyUnicode_1BYTE_KIND:
9808 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9809 break;
9810 case PyUnicode_2BYTE_KIND:
9811 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9812 break;
9813 case PyUnicode_4BYTE_KIND:
9814 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9815 break;
9816 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009817 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009818 }
9819 leave:
9820 PyMem_FREE(tmp);
9821 return res;
9822}
9823
Tim Peters8ce9f162004-08-27 01:49:32 +00009824PyObject *
9825PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009827 PyObject *res;
9828 PyObject *fseq;
9829 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009830 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009832 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009833 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009834 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009835 }
9836
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009837 /* NOTE: the following code can't call back into Python code,
9838 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009839 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009840
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009841 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009842 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009843 res = _PyUnicode_JoinArray(separator, items, seqlen);
9844 Py_DECREF(fseq);
9845 return res;
9846}
9847
9848PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009849_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009850{
9851 PyObject *res = NULL; /* the result */
9852 PyObject *sep = NULL;
9853 Py_ssize_t seplen;
9854 PyObject *item;
9855 Py_ssize_t sz, i, res_offset;
9856 Py_UCS4 maxchar;
9857 Py_UCS4 item_maxchar;
9858 int use_memcpy;
9859 unsigned char *res_data = NULL, *sep_data = NULL;
9860 PyObject *last_obj;
9861 unsigned int kind = 0;
9862
Tim Peters05eba1f2004-08-27 21:32:02 +00009863 /* If empty sequence, return u"". */
9864 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009865 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009866 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009867
Tim Peters05eba1f2004-08-27 21:32:02 +00009868 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009869 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009870 if (seqlen == 1) {
9871 if (PyUnicode_CheckExact(items[0])) {
9872 res = items[0];
9873 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009874 return res;
9875 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009876 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009877 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009878 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009879 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009880 /* Set up sep and seplen */
9881 if (separator == NULL) {
9882 /* fall back to a blank space separator */
9883 sep = PyUnicode_FromOrdinal(' ');
9884 if (!sep)
9885 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009886 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009887 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009888 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009889 else {
9890 if (!PyUnicode_Check(separator)) {
9891 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009892 "separator: expected str instance,"
9893 " %.80s found",
9894 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009895 goto onError;
9896 }
9897 if (PyUnicode_READY(separator))
9898 goto onError;
9899 sep = separator;
9900 seplen = PyUnicode_GET_LENGTH(separator);
9901 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9902 /* inc refcount to keep this code path symmetric with the
9903 above case of a blank separator */
9904 Py_INCREF(sep);
9905 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009906 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009907 }
9908
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009909 /* There are at least two things to join, or else we have a subclass
9910 * of str in the sequence.
9911 * Do a pre-pass to figure out the total amount of space we'll
9912 * need (sz), and see whether all argument are strings.
9913 */
9914 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009915#ifdef Py_DEBUG
9916 use_memcpy = 0;
9917#else
9918 use_memcpy = 1;
9919#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009920 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009921 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009922 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009923 if (!PyUnicode_Check(item)) {
9924 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009925 "sequence item %zd: expected str instance,"
9926 " %.80s found",
9927 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009928 goto onError;
9929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 if (PyUnicode_READY(item) == -1)
9931 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009932 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009934 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009935 if (i != 0) {
9936 add_sz += seplen;
9937 }
9938 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009939 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009940 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009941 goto onError;
9942 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009943 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009944 if (use_memcpy && last_obj != NULL) {
9945 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9946 use_memcpy = 0;
9947 }
9948 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009949 }
Tim Petersced69f82003-09-16 20:30:58 +00009950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009952 if (res == NULL)
9953 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009954
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009955 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009956#ifdef Py_DEBUG
9957 use_memcpy = 0;
9958#else
9959 if (use_memcpy) {
9960 res_data = PyUnicode_1BYTE_DATA(res);
9961 kind = PyUnicode_KIND(res);
9962 if (seplen != 0)
9963 sep_data = PyUnicode_1BYTE_DATA(sep);
9964 }
9965#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009966 if (use_memcpy) {
9967 for (i = 0; i < seqlen; ++i) {
9968 Py_ssize_t itemlen;
9969 item = items[i];
9970
9971 /* Copy item, and maybe the separator. */
9972 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009973 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009974 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009975 kind * seplen);
9976 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009977 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009978
9979 itemlen = PyUnicode_GET_LENGTH(item);
9980 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009981 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009982 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009983 kind * itemlen);
9984 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009985 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009986 }
9987 assert(res_data == PyUnicode_1BYTE_DATA(res)
9988 + kind * PyUnicode_GET_LENGTH(res));
9989 }
9990 else {
9991 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9992 Py_ssize_t itemlen;
9993 item = items[i];
9994
9995 /* Copy item, and maybe the separator. */
9996 if (i && seplen != 0) {
9997 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9998 res_offset += seplen;
9999 }
10000
10001 itemlen = PyUnicode_GET_LENGTH(item);
10002 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010003 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010004 res_offset += itemlen;
10005 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010006 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010007 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010008 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010011 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010013
Benjamin Peterson29060642009-01-31 22:14:21 +000010014 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010016 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017 return NULL;
10018}
10019
/* Store the character `value` into `length` consecutive positions, starting
   at index `start`, of the character buffer `data` whose element width is
   given by `kind` (1, 2 or 4 byte kind; the wchar kind is not allowed).

   `value` and `length` are evaluated exactly once, and every argument is
   parenthesized, so arbitrary expressions may be passed.  The value is
   truncated to the element width of the buffer. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_ssize_t fill_len_ = (length); \
        const Py_UCS4 fill_value_ = (Py_UCS4)(value); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)fill_value_, fill_len_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < fill_len_; ++i_, ++to_) *to_ = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < fill_len_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
10043
Victor Stinnerd3f08822012-05-29 12:57:52 +020010044void
10045_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10046 Py_UCS4 fill_char)
10047{
10048 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10049 const void *data = PyUnicode_DATA(unicode);
10050 assert(PyUnicode_IS_READY(unicode));
10051 assert(unicode_modifiable(unicode));
10052 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10053 assert(start >= 0);
10054 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10055 FILL(kind, data, fill_char, start, length);
10056}
10057
Victor Stinner3fe55312012-01-04 00:33:50 +010010058Py_ssize_t
10059PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10060 Py_UCS4 fill_char)
10061{
10062 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010063
10064 if (!PyUnicode_Check(unicode)) {
10065 PyErr_BadInternalCall();
10066 return -1;
10067 }
10068 if (PyUnicode_READY(unicode) == -1)
10069 return -1;
10070 if (unicode_check_modifiable(unicode))
10071 return -1;
10072
Victor Stinnerd3f08822012-05-29 12:57:52 +020010073 if (start < 0) {
10074 PyErr_SetString(PyExc_IndexError, "string index out of range");
10075 return -1;
10076 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010077 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10078 PyErr_SetString(PyExc_ValueError,
10079 "fill character is bigger than "
10080 "the string maximum character");
10081 return -1;
10082 }
10083
10084 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10085 length = Py_MIN(maxlen, length);
10086 if (length <= 0)
10087 return 0;
10088
Victor Stinnerd3f08822012-05-29 12:57:52 +020010089 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010090 return length;
10091}
10092
Victor Stinner9310abb2011-10-05 00:59:23 +020010093static PyObject *
10094pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010095 Py_ssize_t left,
10096 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 PyObject *u;
10100 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010101 int kind;
10102 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010103
10104 if (left < 0)
10105 left = 0;
10106 if (right < 0)
10107 right = 0;
10108
Victor Stinnerc4b49542011-12-11 22:44:26 +010010109 if (left == 0 && right == 0)
10110 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10113 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010114 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10115 return NULL;
10116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010118 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010120 if (!u)
10121 return NULL;
10122
10123 kind = PyUnicode_KIND(u);
10124 data = PyUnicode_DATA(u);
10125 if (left)
10126 FILL(kind, data, fill, 0, left);
10127 if (right)
10128 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010129 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010130 assert(_PyUnicode_CheckConsistency(u, 1));
10131 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132}
10133
Alexander Belopolsky40018472011-02-26 01:02:56 +000010134PyObject *
10135PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010139 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010140 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141
Benjamin Petersonead6b532011-12-20 17:23:42 -060010142 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010144 if (PyUnicode_IS_ASCII(string))
10145 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010146 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010147 PyUnicode_GET_LENGTH(string), keepends);
10148 else
10149 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010150 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010151 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 break;
10153 case PyUnicode_2BYTE_KIND:
10154 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010155 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 PyUnicode_GET_LENGTH(string), keepends);
10157 break;
10158 case PyUnicode_4BYTE_KIND:
10159 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010160 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 PyUnicode_GET_LENGTH(string), keepends);
10162 break;
10163 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010164 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167}
10168
Alexander Belopolsky40018472011-02-26 01:02:56 +000010169static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010170split(PyObject *self,
10171 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010172 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010174 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 void *buf1, *buf2;
10176 Py_ssize_t len1, len2;
10177 PyObject* out;
10178
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010180 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (PyUnicode_READY(self) == -1)
10183 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010186 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010188 if (PyUnicode_IS_ASCII(self))
10189 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010190 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010191 PyUnicode_GET_LENGTH(self), maxcount
10192 );
10193 else
10194 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010195 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010196 PyUnicode_GET_LENGTH(self), maxcount
10197 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 case PyUnicode_2BYTE_KIND:
10199 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010200 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 PyUnicode_GET_LENGTH(self), maxcount
10202 );
10203 case PyUnicode_4BYTE_KIND:
10204 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010205 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 PyUnicode_GET_LENGTH(self), maxcount
10207 );
10208 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010209 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 }
10211
10212 if (PyUnicode_READY(substring) == -1)
10213 return NULL;
10214
10215 kind1 = PyUnicode_KIND(self);
10216 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 len1 = PyUnicode_GET_LENGTH(self);
10218 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010219 if (kind1 < kind2 || len1 < len2) {
10220 out = PyList_New(1);
10221 if (out == NULL)
10222 return NULL;
10223 Py_INCREF(self);
10224 PyList_SET_ITEM(out, 0, self);
10225 return out;
10226 }
10227 buf1 = PyUnicode_DATA(self);
10228 buf2 = PyUnicode_DATA(substring);
10229 if (kind2 != kind1) {
10230 buf2 = _PyUnicode_AsKind(substring, kind1);
10231 if (!buf2)
10232 return NULL;
10233 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010235 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010237 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10238 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010239 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010240 else
10241 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010242 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 break;
10244 case PyUnicode_2BYTE_KIND:
10245 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010246 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 break;
10248 case PyUnicode_4BYTE_KIND:
10249 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010250 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 break;
10252 default:
10253 out = NULL;
10254 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010255 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 PyMem_Free(buf2);
10257 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258}
10259
Alexander Belopolsky40018472011-02-26 01:02:56 +000010260static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010261rsplit(PyObject *self,
10262 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010263 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010264{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010265 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 void *buf1, *buf2;
10267 Py_ssize_t len1, len2;
10268 PyObject* out;
10269
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010270 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010271 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 if (PyUnicode_READY(self) == -1)
10274 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010277 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010279 if (PyUnicode_IS_ASCII(self))
10280 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010281 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010282 PyUnicode_GET_LENGTH(self), maxcount
10283 );
10284 else
10285 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010286 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010287 PyUnicode_GET_LENGTH(self), maxcount
10288 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 case PyUnicode_2BYTE_KIND:
10290 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010291 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 PyUnicode_GET_LENGTH(self), maxcount
10293 );
10294 case PyUnicode_4BYTE_KIND:
10295 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010296 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 PyUnicode_GET_LENGTH(self), maxcount
10298 );
10299 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010300 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 }
10302
10303 if (PyUnicode_READY(substring) == -1)
10304 return NULL;
10305
10306 kind1 = PyUnicode_KIND(self);
10307 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 len1 = PyUnicode_GET_LENGTH(self);
10309 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010310 if (kind1 < kind2 || len1 < len2) {
10311 out = PyList_New(1);
10312 if (out == NULL)
10313 return NULL;
10314 Py_INCREF(self);
10315 PyList_SET_ITEM(out, 0, self);
10316 return out;
10317 }
10318 buf1 = PyUnicode_DATA(self);
10319 buf2 = PyUnicode_DATA(substring);
10320 if (kind2 != kind1) {
10321 buf2 = _PyUnicode_AsKind(substring, kind1);
10322 if (!buf2)
10323 return NULL;
10324 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010326 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010328 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10329 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010330 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010331 else
10332 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010333 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 break;
10335 case PyUnicode_2BYTE_KIND:
10336 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010337 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 break;
10339 case PyUnicode_4BYTE_KIND:
10340 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010341 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 break;
10343 default:
10344 out = NULL;
10345 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010346 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 PyMem_Free(buf2);
10348 return out;
10349}
10350
10351static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010352anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10353 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010355 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010357 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10358 return asciilib_find(buf1, len1, buf2, len2, offset);
10359 else
10360 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 case PyUnicode_2BYTE_KIND:
10362 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10363 case PyUnicode_4BYTE_KIND:
10364 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10365 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010366 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367}
10368
10369static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010370anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10371 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010373 switch (kind) {
10374 case PyUnicode_1BYTE_KIND:
10375 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10376 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10377 else
10378 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10379 case PyUnicode_2BYTE_KIND:
10380 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10381 case PyUnicode_4BYTE_KIND:
10382 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10383 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010384 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010385}
10386
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010387static void
10388replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10389 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10390{
10391 int kind = PyUnicode_KIND(u);
10392 void *data = PyUnicode_DATA(u);
10393 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10394 if (kind == PyUnicode_1BYTE_KIND) {
10395 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10396 (Py_UCS1 *)data + len,
10397 u1, u2, maxcount);
10398 }
10399 else if (kind == PyUnicode_2BYTE_KIND) {
10400 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10401 (Py_UCS2 *)data + len,
10402 u1, u2, maxcount);
10403 }
10404 else {
10405 assert(kind == PyUnicode_4BYTE_KIND);
10406 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10407 (Py_UCS4 *)data + len,
10408 u1, u2, maxcount);
10409 }
10410}
10411
Alexander Belopolsky40018472011-02-26 01:02:56 +000010412static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413replace(PyObject *self, PyObject *str1,
10414 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 PyObject *u;
10417 char *sbuf = PyUnicode_DATA(self);
10418 char *buf1 = PyUnicode_DATA(str1);
10419 char *buf2 = PyUnicode_DATA(str2);
10420 int srelease = 0, release1 = 0, release2 = 0;
10421 int skind = PyUnicode_KIND(self);
10422 int kind1 = PyUnicode_KIND(str1);
10423 int kind2 = PyUnicode_KIND(str2);
10424 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10425 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10426 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010427 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010428 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429
10430 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010431 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010433 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010434
Victor Stinner59de0ee2011-10-07 10:01:28 +020010435 if (str1 == str2)
10436 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437
Victor Stinner49a0a212011-10-12 23:46:10 +020010438 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010439 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10440 if (maxchar < maxchar_str1)
10441 /* substring too wide to be present */
10442 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010443 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10444 /* Replacing str1 with str2 may cause a maxchar reduction in the
10445 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010446 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010447 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010450 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010452 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010455 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010456 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010457
Victor Stinner69ed0f42013-04-09 21:48:24 +020010458 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010459 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010460 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010462 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010464 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010466
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010467 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10468 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010469 }
10470 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 int rkind = skind;
10472 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010473 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 if (kind1 < rkind) {
10476 /* widen substring */
10477 buf1 = _PyUnicode_AsKind(str1, rkind);
10478 if (!buf1) goto error;
10479 release1 = 1;
10480 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010481 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010482 if (i < 0)
10483 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 if (rkind > kind2) {
10485 /* widen replacement */
10486 buf2 = _PyUnicode_AsKind(str2, rkind);
10487 if (!buf2) goto error;
10488 release2 = 1;
10489 }
10490 else if (rkind < kind2) {
10491 /* widen self and buf1 */
10492 rkind = kind2;
10493 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010494 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 sbuf = _PyUnicode_AsKind(self, rkind);
10496 if (!sbuf) goto error;
10497 srelease = 1;
10498 buf1 = _PyUnicode_AsKind(str1, rkind);
10499 if (!buf1) goto error;
10500 release1 = 1;
10501 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010502 u = PyUnicode_New(slen, maxchar);
10503 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010505 assert(PyUnicode_KIND(u) == rkind);
10506 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010507
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010508 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010509 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010510 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010512 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010514
10515 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010516 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010517 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010518 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010519 if (i == -1)
10520 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010521 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010523 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010527 }
10528 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010530 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 int rkind = skind;
10532 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010535 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 buf1 = _PyUnicode_AsKind(str1, rkind);
10537 if (!buf1) goto error;
10538 release1 = 1;
10539 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010540 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010541 if (n == 0)
10542 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010544 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 buf2 = _PyUnicode_AsKind(str2, rkind);
10546 if (!buf2) goto error;
10547 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010550 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 rkind = kind2;
10552 sbuf = _PyUnicode_AsKind(self, rkind);
10553 if (!sbuf) goto error;
10554 srelease = 1;
10555 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010556 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 buf1 = _PyUnicode_AsKind(str1, rkind);
10558 if (!buf1) goto error;
10559 release1 = 1;
10560 }
10561 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10562 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010563 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 PyErr_SetString(PyExc_OverflowError,
10565 "replace string is too long");
10566 goto error;
10567 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010568 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010569 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010570 _Py_INCREF_UNICODE_EMPTY();
10571 if (!unicode_empty)
10572 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010573 u = unicode_empty;
10574 goto done;
10575 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010576 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 PyErr_SetString(PyExc_OverflowError,
10578 "replace string is too long");
10579 goto error;
10580 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010581 u = PyUnicode_New(new_size, maxchar);
10582 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010584 assert(PyUnicode_KIND(u) == rkind);
10585 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 ires = i = 0;
10587 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010588 while (n-- > 0) {
10589 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010590 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010591 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010592 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010593 if (j == -1)
10594 break;
10595 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010596 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010597 memcpy(res + rkind * ires,
10598 sbuf + rkind * i,
10599 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010601 }
10602 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010604 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010606 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010608 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010612 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010613 memcpy(res + rkind * ires,
10614 sbuf + rkind * i,
10615 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010616 }
10617 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010618 /* interleave */
10619 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010620 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010622 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010624 if (--n <= 0)
10625 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010626 memcpy(res + rkind * ires,
10627 sbuf + rkind * i,
10628 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 ires++;
10630 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010631 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010632 memcpy(res + rkind * ires,
10633 sbuf + rkind * i,
10634 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010635 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010636 }
10637
10638 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010639 unicode_adjust_maxchar(&u);
10640 if (u == NULL)
10641 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010643
10644 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 if (srelease)
10646 PyMem_FREE(sbuf);
10647 if (release1)
10648 PyMem_FREE(buf1);
10649 if (release2)
10650 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010651 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653
Benjamin Peterson29060642009-01-31 22:14:21 +000010654 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010655 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (srelease)
10657 PyMem_FREE(sbuf);
10658 if (release1)
10659 PyMem_FREE(buf1);
10660 if (release2)
10661 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010662 return unicode_result_unchanged(self);
10663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 error:
10665 if (srelease && sbuf)
10666 PyMem_FREE(sbuf);
10667 if (release1 && buf1)
10668 PyMem_FREE(buf1);
10669 if (release2 && buf2)
10670 PyMem_FREE(buf2);
10671 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672}
10673
10674/* --- Unicode Object Methods --------------------------------------------- */
10675
INADA Naoki3ae20562017-01-16 20:41:20 +090010676/*[clinic input]
10677str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678
INADA Naoki3ae20562017-01-16 20:41:20 +090010679Return a version of the string where each word is titlecased.
10680
10681More specifically, words start with uppercased characters and all remaining
10682cased characters have lower case.
10683[clinic start generated code]*/
10684
10685static PyObject *
10686unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010687/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010689 if (PyUnicode_READY(self) == -1)
10690 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010691 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692}
10693
INADA Naoki3ae20562017-01-16 20:41:20 +090010694/*[clinic input]
10695str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696
INADA Naoki3ae20562017-01-16 20:41:20 +090010697Return a capitalized version of the string.
10698
10699More specifically, make the first character have upper case and the rest lower
10700case.
10701[clinic start generated code]*/
10702
10703static PyObject *
10704unicode_capitalize_impl(PyObject *self)
10705/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010707 if (PyUnicode_READY(self) == -1)
10708 return NULL;
10709 if (PyUnicode_GET_LENGTH(self) == 0)
10710 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010711 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712}
10713
INADA Naoki3ae20562017-01-16 20:41:20 +090010714/*[clinic input]
10715str.casefold as unicode_casefold
10716
10717Return a version of the string suitable for caseless comparisons.
10718[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010719
10720static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010721unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010722/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010723{
10724 if (PyUnicode_READY(self) == -1)
10725 return NULL;
10726 if (PyUnicode_IS_ASCII(self))
10727 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010728 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010729}
10730
10731
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010732/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010733
10734static int
10735convert_uc(PyObject *obj, void *addr)
10736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010738
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010739 if (!PyUnicode_Check(obj)) {
10740 PyErr_Format(PyExc_TypeError,
10741 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010742 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010743 return 0;
10744 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010745 if (PyUnicode_READY(obj) < 0)
10746 return 0;
10747 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010748 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010749 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010750 return 0;
10751 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010752 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010753 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010754}
10755
INADA Naoki3ae20562017-01-16 20:41:20 +090010756/*[clinic input]
10757str.center as unicode_center
10758
10759 width: Py_ssize_t
10760 fillchar: Py_UCS4 = ' '
10761 /
10762
10763Return a centered string of length width.
10764
10765Padding is done using the specified fill character (default is a space).
10766[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767
10768static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010769unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10770/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010772 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773
Benjamin Petersonbac79492012-01-14 13:34:47 -050010774 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775 return NULL;
10776
Victor Stinnerc4b49542011-12-11 22:44:26 +010010777 if (PyUnicode_GET_LENGTH(self) >= width)
10778 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779
Victor Stinnerc4b49542011-12-11 22:44:26 +010010780 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 left = marg / 2 + (marg & width & 1);
10782
Victor Stinner9310abb2011-10-05 00:59:23 +020010783 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784}
10785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786/* This function assumes that str1 and str2 are readied by the caller. */
10787
Marc-André Lemburge5034372000-08-08 08:04:29 +000010788static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010789unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010790{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010791#define COMPARE(TYPE1, TYPE2) \
10792 do { \
10793 TYPE1* p1 = (TYPE1 *)data1; \
10794 TYPE2* p2 = (TYPE2 *)data2; \
10795 TYPE1* end = p1 + len; \
10796 Py_UCS4 c1, c2; \
10797 for (; p1 != end; p1++, p2++) { \
10798 c1 = *p1; \
10799 c2 = *p2; \
10800 if (c1 != c2) \
10801 return (c1 < c2) ? -1 : 1; \
10802 } \
10803 } \
10804 while (0)
10805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 int kind1, kind2;
10807 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010808 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 kind1 = PyUnicode_KIND(str1);
10811 kind2 = PyUnicode_KIND(str2);
10812 data1 = PyUnicode_DATA(str1);
10813 data2 = PyUnicode_DATA(str2);
10814 len1 = PyUnicode_GET_LENGTH(str1);
10815 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010816 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010817
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010818 switch(kind1) {
10819 case PyUnicode_1BYTE_KIND:
10820 {
10821 switch(kind2) {
10822 case PyUnicode_1BYTE_KIND:
10823 {
10824 int cmp = memcmp(data1, data2, len);
10825 /* normalize result of memcmp() into the range [-1; 1] */
10826 if (cmp < 0)
10827 return -1;
10828 if (cmp > 0)
10829 return 1;
10830 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010831 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010832 case PyUnicode_2BYTE_KIND:
10833 COMPARE(Py_UCS1, Py_UCS2);
10834 break;
10835 case PyUnicode_4BYTE_KIND:
10836 COMPARE(Py_UCS1, Py_UCS4);
10837 break;
10838 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010839 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010840 }
10841 break;
10842 }
10843 case PyUnicode_2BYTE_KIND:
10844 {
10845 switch(kind2) {
10846 case PyUnicode_1BYTE_KIND:
10847 COMPARE(Py_UCS2, Py_UCS1);
10848 break;
10849 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010850 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010851 COMPARE(Py_UCS2, Py_UCS2);
10852 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010853 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010854 case PyUnicode_4BYTE_KIND:
10855 COMPARE(Py_UCS2, Py_UCS4);
10856 break;
10857 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010858 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010859 }
10860 break;
10861 }
10862 case PyUnicode_4BYTE_KIND:
10863 {
10864 switch(kind2) {
10865 case PyUnicode_1BYTE_KIND:
10866 COMPARE(Py_UCS4, Py_UCS1);
10867 break;
10868 case PyUnicode_2BYTE_KIND:
10869 COMPARE(Py_UCS4, Py_UCS2);
10870 break;
10871 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010872 {
10873#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10874 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10875 /* normalize result of wmemcmp() into the range [-1; 1] */
10876 if (cmp < 0)
10877 return -1;
10878 if (cmp > 0)
10879 return 1;
10880#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010881 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010882#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010883 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010884 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010885 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010886 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010887 }
10888 break;
10889 }
10890 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010891 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010892 }
10893
Victor Stinner770e19e2012-10-04 22:59:45 +020010894 if (len1 == len2)
10895 return 0;
10896 if (len1 < len2)
10897 return -1;
10898 else
10899 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010900
10901#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010902}
10903
Benjamin Peterson621b4302016-09-09 13:54:34 -070010904static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010905unicode_compare_eq(PyObject *str1, PyObject *str2)
10906{
10907 int kind;
10908 void *data1, *data2;
10909 Py_ssize_t len;
10910 int cmp;
10911
Victor Stinnere5567ad2012-10-23 02:48:49 +020010912 len = PyUnicode_GET_LENGTH(str1);
10913 if (PyUnicode_GET_LENGTH(str2) != len)
10914 return 0;
10915 kind = PyUnicode_KIND(str1);
10916 if (PyUnicode_KIND(str2) != kind)
10917 return 0;
10918 data1 = PyUnicode_DATA(str1);
10919 data2 = PyUnicode_DATA(str2);
10920
10921 cmp = memcmp(data1, data2, len * kind);
10922 return (cmp == 0);
10923}
10924
10925
Alexander Belopolsky40018472011-02-26 01:02:56 +000010926int
10927PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10930 if (PyUnicode_READY(left) == -1 ||
10931 PyUnicode_READY(right) == -1)
10932 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010933
10934 /* a string is equal to itself */
10935 if (left == right)
10936 return 0;
10937
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010938 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010940 PyErr_Format(PyExc_TypeError,
10941 "Can't compare %.100s and %.100s",
10942 left->ob_type->tp_name,
10943 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944 return -1;
10945}
10946
Martin v. Löwis5b222132007-06-10 09:51:05 +000010947int
10948PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10949{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 Py_ssize_t i;
10951 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010953 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954
Victor Stinner910337b2011-10-03 03:20:16 +020010955 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010956 if (!PyUnicode_IS_READY(uni)) {
10957 const wchar_t *ws = _PyUnicode_WSTR(uni);
10958 /* Compare Unicode string and source character set string */
10959 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10960 if (chr != ustr[i])
10961 return (chr < ustr[i]) ? -1 : 1;
10962 }
10963 /* This check keeps Python strings that end in '\0' from comparing equal
10964 to C strings identical up to that point. */
10965 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
10966 return 1; /* uni is longer */
10967 if (ustr[i])
10968 return -1; /* str is longer */
10969 return 0;
10970 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010972 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010973 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010974 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010975 size_t len, len2 = strlen(str);
10976 int cmp;
10977
10978 len = Py_MIN(len1, len2);
10979 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010980 if (cmp != 0) {
10981 if (cmp < 0)
10982 return -1;
10983 else
10984 return 1;
10985 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010986 if (len1 > len2)
10987 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010988 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010989 return -1; /* str is longer */
10990 return 0;
10991 }
10992 else {
10993 void *data = PyUnicode_DATA(uni);
10994 /* Compare Unicode string and source character set string */
10995 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010996 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010997 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10998 /* This check keeps Python strings that end in '\0' from comparing equal
10999 to C strings identical up to that point. */
11000 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11001 return 1; /* uni is longer */
11002 if (str[i])
11003 return -1; /* str is longer */
11004 return 0;
11005 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011006}
11007
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011008static int
11009non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11010{
11011 size_t i, len;
11012 const wchar_t *p;
11013 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11014 if (strlen(str) != len)
11015 return 0;
11016 p = _PyUnicode_WSTR(unicode);
11017 assert(p);
11018 for (i = 0; i < len; i++) {
11019 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011020 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011021 return 0;
11022 }
11023 return 1;
11024}
11025
11026int
11027_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11028{
11029 size_t len;
11030 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011031 assert(str);
11032#ifndef NDEBUG
11033 for (const char *p = str; *p; p++) {
11034 assert((unsigned char)*p < 128);
11035 }
11036#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011037 if (PyUnicode_READY(unicode) == -1) {
11038 /* Memory error or bad data */
11039 PyErr_Clear();
11040 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11041 }
11042 if (!PyUnicode_IS_ASCII(unicode))
11043 return 0;
11044 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11045 return strlen(str) == len &&
11046 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11047}
11048
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011049int
11050_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11051{
11052 PyObject *right_uni;
11053 Py_hash_t hash;
11054
11055 assert(_PyUnicode_CHECK(left));
11056 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011057#ifndef NDEBUG
11058 for (const char *p = right->string; *p; p++) {
11059 assert((unsigned char)*p < 128);
11060 }
11061#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011062
11063 if (PyUnicode_READY(left) == -1) {
11064 /* memory error or bad data */
11065 PyErr_Clear();
11066 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11067 }
11068
11069 if (!PyUnicode_IS_ASCII(left))
11070 return 0;
11071
11072 right_uni = _PyUnicode_FromId(right); /* borrowed */
11073 if (right_uni == NULL) {
11074 /* memory error or bad data */
11075 PyErr_Clear();
11076 return _PyUnicode_EqualToASCIIString(left, right->string);
11077 }
11078
11079 if (left == right_uni)
11080 return 1;
11081
11082 if (PyUnicode_CHECK_INTERNED(left))
11083 return 0;
11084
INADA Naoki7cc95f52018-01-28 02:07:09 +090011085 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011086 hash = _PyUnicode_HASH(left);
11087 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11088 return 0;
11089
11090 return unicode_compare_eq(left, right_uni);
11091}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011092
Alexander Belopolsky40018472011-02-26 01:02:56 +000011093PyObject *
11094PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011095{
11096 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011097
Victor Stinnere5567ad2012-10-23 02:48:49 +020011098 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11099 Py_RETURN_NOTIMPLEMENTED;
11100
11101 if (PyUnicode_READY(left) == -1 ||
11102 PyUnicode_READY(right) == -1)
11103 return NULL;
11104
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011105 if (left == right) {
11106 switch (op) {
11107 case Py_EQ:
11108 case Py_LE:
11109 case Py_GE:
11110 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011111 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011112 case Py_NE:
11113 case Py_LT:
11114 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011115 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011116 default:
11117 PyErr_BadArgument();
11118 return NULL;
11119 }
11120 }
11121 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011122 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011123 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011124 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011125 }
11126 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011127 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011128 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011129 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011130}
11131
Alexander Belopolsky40018472011-02-26 01:02:56 +000011132int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011133_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11134{
11135 return unicode_eq(aa, bb);
11136}
11137
11138int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011139PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011140{
Victor Stinner77282cb2013-04-14 19:22:47 +020011141 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142 void *buf1, *buf2;
11143 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011144 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011145
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011146 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011147 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011148 "'in <string>' requires string as left operand, not %.100s",
11149 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011150 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011151 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011152 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011153 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011154 if (ensure_unicode(str) < 0)
11155 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011158 kind2 = PyUnicode_KIND(substr);
11159 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011160 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011162 len2 = PyUnicode_GET_LENGTH(substr);
11163 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011164 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011165 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011166 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011167 if (len2 == 1) {
11168 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11169 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011170 return result;
11171 }
11172 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011173 buf2 = _PyUnicode_AsKind(substr, kind1);
11174 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011175 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177
Victor Stinner77282cb2013-04-14 19:22:47 +020011178 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 case PyUnicode_1BYTE_KIND:
11180 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11181 break;
11182 case PyUnicode_2BYTE_KIND:
11183 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11184 break;
11185 case PyUnicode_4BYTE_KIND:
11186 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11187 break;
11188 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011189 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011191
Victor Stinner77282cb2013-04-14 19:22:47 +020011192 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 PyMem_Free(buf2);
11194
Guido van Rossum403d68b2000-03-13 15:55:09 +000011195 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011196}
11197
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198/* Concat to string or Unicode object giving a new Unicode object. */
11199
Alexander Belopolsky40018472011-02-26 01:02:56 +000011200PyObject *
11201PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011203 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011204 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011205 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011207 if (ensure_unicode(left) < 0)
11208 return NULL;
11209
11210 if (!PyUnicode_Check(right)) {
11211 PyErr_Format(PyExc_TypeError,
11212 "can only concatenate str (not \"%.200s\") to str",
11213 right->ob_type->tp_name);
11214 return NULL;
11215 }
11216 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011217 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218
11219 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011220 if (left == unicode_empty)
11221 return PyUnicode_FromObject(right);
11222 if (right == unicode_empty)
11223 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011225 left_len = PyUnicode_GET_LENGTH(left);
11226 right_len = PyUnicode_GET_LENGTH(right);
11227 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011228 PyErr_SetString(PyExc_OverflowError,
11229 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011230 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011231 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011232 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011233
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011234 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11235 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011236 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011239 result = PyUnicode_New(new_len, maxchar);
11240 if (result == NULL)
11241 return NULL;
11242 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11243 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11244 assert(_PyUnicode_CheckConsistency(result, 1));
11245 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246}
11247
Walter Dörwald1ab83302007-05-18 17:15:44 +000011248void
Victor Stinner23e56682011-10-03 03:54:37 +020011249PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011250{
Victor Stinner23e56682011-10-03 03:54:37 +020011251 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011252 Py_UCS4 maxchar, maxchar2;
11253 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011254
11255 if (p_left == NULL) {
11256 if (!PyErr_Occurred())
11257 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011258 return;
11259 }
Victor Stinner23e56682011-10-03 03:54:37 +020011260 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011261 if (right == NULL || left == NULL
11262 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011263 if (!PyErr_Occurred())
11264 PyErr_BadInternalCall();
11265 goto error;
11266 }
11267
Benjamin Petersonbac79492012-01-14 13:34:47 -050011268 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011269 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011270 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011271 goto error;
11272
Victor Stinner488fa492011-12-12 00:01:39 +010011273 /* Shortcuts */
11274 if (left == unicode_empty) {
11275 Py_DECREF(left);
11276 Py_INCREF(right);
11277 *p_left = right;
11278 return;
11279 }
11280 if (right == unicode_empty)
11281 return;
11282
11283 left_len = PyUnicode_GET_LENGTH(left);
11284 right_len = PyUnicode_GET_LENGTH(right);
11285 if (left_len > PY_SSIZE_T_MAX - right_len) {
11286 PyErr_SetString(PyExc_OverflowError,
11287 "strings are too large to concat");
11288 goto error;
11289 }
11290 new_len = left_len + right_len;
11291
11292 if (unicode_modifiable(left)
11293 && PyUnicode_CheckExact(right)
11294 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011295 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11296 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011297 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011298 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011299 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11300 {
11301 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011302 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011303 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011304
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011305 /* copy 'right' into the newly allocated area of 'left' */
11306 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011307 }
Victor Stinner488fa492011-12-12 00:01:39 +010011308 else {
11309 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11310 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011311 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011312
Victor Stinner488fa492011-12-12 00:01:39 +010011313 /* Concat the two Unicode strings */
11314 res = PyUnicode_New(new_len, maxchar);
11315 if (res == NULL)
11316 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011317 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11318 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011319 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011320 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011321 }
11322 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011323 return;
11324
11325error:
Victor Stinner488fa492011-12-12 00:01:39 +010011326 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011327}
11328
11329void
11330PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11331{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011332 PyUnicode_Append(pleft, right);
11333 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011334}
11335
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011336/*
11337Wraps stringlib_parse_args_finds() and additionally ensures that the
11338first argument is a unicode object.
11339*/
11340
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011341static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011342parse_args_finds_unicode(const char * function_name, PyObject *args,
11343 PyObject **substring,
11344 Py_ssize_t *start, Py_ssize_t *end)
11345{
11346 if(stringlib_parse_args_finds(function_name, args, substring,
11347 start, end)) {
11348 if (ensure_unicode(*substring) < 0)
11349 return 0;
11350 return 1;
11351 }
11352 return 0;
11353}
11354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011355PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011356 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011358Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011359string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011360interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361
11362static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011363unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011365 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011366 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011367 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011369 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 void *buf1, *buf2;
11371 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011373 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011374 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 kind1 = PyUnicode_KIND(self);
11377 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011378 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011379 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 len1 = PyUnicode_GET_LENGTH(self);
11382 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011384 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011385 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011386
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011387 buf1 = PyUnicode_DATA(self);
11388 buf2 = PyUnicode_DATA(substring);
11389 if (kind2 != kind1) {
11390 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011391 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011392 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011393 }
11394 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011395 case PyUnicode_1BYTE_KIND:
11396 iresult = ucs1lib_count(
11397 ((Py_UCS1*)buf1) + start, end - start,
11398 buf2, len2, PY_SSIZE_T_MAX
11399 );
11400 break;
11401 case PyUnicode_2BYTE_KIND:
11402 iresult = ucs2lib_count(
11403 ((Py_UCS2*)buf1) + start, end - start,
11404 buf2, len2, PY_SSIZE_T_MAX
11405 );
11406 break;
11407 case PyUnicode_4BYTE_KIND:
11408 iresult = ucs4lib_count(
11409 ((Py_UCS4*)buf1) + start, end - start,
11410 buf2, len2, PY_SSIZE_T_MAX
11411 );
11412 break;
11413 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011414 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 }
11416
11417 result = PyLong_FromSsize_t(iresult);
11418
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011419 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422 return result;
11423}
11424
INADA Naoki3ae20562017-01-16 20:41:20 +090011425/*[clinic input]
11426str.encode as unicode_encode
11427
11428 encoding: str(c_default="NULL") = 'utf-8'
11429 The encoding in which to encode the string.
11430 errors: str(c_default="NULL") = 'strict'
11431 The error handling scheme to use for encoding errors.
11432 The default is 'strict' meaning that encoding errors raise a
11433 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11434 'xmlcharrefreplace' as well as any other name registered with
11435 codecs.register_error that can handle UnicodeEncodeErrors.
11436
11437Encode the string using the codec registered for encoding.
11438[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
11440static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011441unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011442/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011444 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011445}
11446
INADA Naoki3ae20562017-01-16 20:41:20 +090011447/*[clinic input]
11448str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449
INADA Naoki3ae20562017-01-16 20:41:20 +090011450 tabsize: int = 8
11451
11452Return a copy where all tab characters are expanded using spaces.
11453
11454If tabsize is not given, a tab size of 8 characters is assumed.
11455[clinic start generated code]*/
11456
11457static PyObject *
11458unicode_expandtabs_impl(PyObject *self, int tabsize)
11459/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011461 Py_ssize_t i, j, line_pos, src_len, incr;
11462 Py_UCS4 ch;
11463 PyObject *u;
11464 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011465 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011466 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467
Antoine Pitrou22425222011-10-04 19:10:51 +020011468 if (PyUnicode_READY(self) == -1)
11469 return NULL;
11470
Thomas Wouters7e474022000-07-16 12:04:32 +000011471 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011472 src_len = PyUnicode_GET_LENGTH(self);
11473 i = j = line_pos = 0;
11474 kind = PyUnicode_KIND(self);
11475 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011476 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011477 for (; i < src_len; i++) {
11478 ch = PyUnicode_READ(kind, src_data, i);
11479 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011480 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011482 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011483 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011484 goto overflow;
11485 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011487 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011491 goto overflow;
11492 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011494 if (ch == '\n' || ch == '\r')
11495 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011497 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011498 if (!found)
11499 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011500
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011502 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 if (!u)
11504 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011505 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
Antoine Pitroue71d5742011-10-04 15:55:09 +020011507 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
Antoine Pitroue71d5742011-10-04 15:55:09 +020011509 for (; i < src_len; i++) {
11510 ch = PyUnicode_READ(kind, src_data, i);
11511 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011513 incr = tabsize - (line_pos % tabsize);
11514 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011515 FILL(kind, dest_data, ' ', j, incr);
11516 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011517 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011518 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011520 line_pos++;
11521 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011522 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011523 if (ch == '\n' || ch == '\r')
11524 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011526 }
11527 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011528 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011529
Antoine Pitroue71d5742011-10-04 15:55:09 +020011530 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011531 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11532 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533}
11534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011535PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011536 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537\n\
11538Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011539such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540arguments start and end are interpreted as in slice notation.\n\
11541\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011542Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543
11544static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011547 /* initialize variables to prevent gcc warning */
11548 PyObject *substring = NULL;
11549 Py_ssize_t start = 0;
11550 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011551 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011553 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011556 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011559 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 if (result == -2)
11562 return NULL;
11563
Christian Heimes217cfd12007-12-02 14:31:20 +000011564 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565}
11566
11567static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011568unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011570 void *data;
11571 enum PyUnicode_Kind kind;
11572 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011573
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011574 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011575 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011577 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011578 if (PyUnicode_READY(self) == -1) {
11579 return NULL;
11580 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011581 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11582 PyErr_SetString(PyExc_IndexError, "string index out of range");
11583 return NULL;
11584 }
11585 kind = PyUnicode_KIND(self);
11586 data = PyUnicode_DATA(self);
11587 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011588 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589}
11590
Guido van Rossumc2504932007-09-18 19:42:40 +000011591/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011592 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011593static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011594unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595{
Guido van Rossumc2504932007-09-18 19:42:40 +000011596 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011597 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011598
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011599#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011600 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011601#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 if (_PyUnicode_HASH(self) != -1)
11603 return _PyUnicode_HASH(self);
11604 if (PyUnicode_READY(self) == -1)
11605 return -1;
11606 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011607 /*
11608 We make the hash of the empty string be 0, rather than using
11609 (prefix ^ suffix), since this slightly obfuscates the hash secret
11610 */
11611 if (len == 0) {
11612 _PyUnicode_HASH(self) = 0;
11613 return 0;
11614 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011615 x = _Py_HashBytes(PyUnicode_DATA(self),
11616 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011618 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619}
11620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011621PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011622 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623\n\
oldkaa0735f2018-02-02 16:52:55 +080011624Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011625such that sub is contained within S[start:end]. Optional\n\
11626arguments start and end are interpreted as in slice notation.\n\
11627\n\
11628Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
11630static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011633 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011634 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011635 PyObject *substring = NULL;
11636 Py_ssize_t start = 0;
11637 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011639 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011642 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011645 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 if (result == -2)
11648 return NULL;
11649
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650 if (result < 0) {
11651 PyErr_SetString(PyExc_ValueError, "substring not found");
11652 return NULL;
11653 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011654
Christian Heimes217cfd12007-12-02 14:31:20 +000011655 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656}
11657
INADA Naoki3ae20562017-01-16 20:41:20 +090011658/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011659str.isascii as unicode_isascii
11660
11661Return True if all characters in the string are ASCII, False otherwise.
11662
11663ASCII characters have code points in the range U+0000-U+007F.
11664Empty string is ASCII too.
11665[clinic start generated code]*/
11666
11667static PyObject *
11668unicode_isascii_impl(PyObject *self)
11669/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11670{
11671 if (PyUnicode_READY(self) == -1) {
11672 return NULL;
11673 }
11674 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11675}
11676
11677/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011678str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679
INADA Naoki3ae20562017-01-16 20:41:20 +090011680Return True if the string is a lowercase string, False otherwise.
11681
11682A string is lowercase if all cased characters in the string are lowercase and
11683there is at least one cased character in the string.
11684[clinic start generated code]*/
11685
11686static PyObject *
11687unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011688/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 Py_ssize_t i, length;
11691 int kind;
11692 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693 int cased;
11694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 if (PyUnicode_READY(self) == -1)
11696 return NULL;
11697 length = PyUnicode_GET_LENGTH(self);
11698 kind = PyUnicode_KIND(self);
11699 data = PyUnicode_DATA(self);
11700
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702 if (length == 1)
11703 return PyBool_FromLong(
11704 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011706 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011708 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011709
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 for (i = 0; i < length; i++) {
11712 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011713
Benjamin Peterson29060642009-01-31 22:14:21 +000011714 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011715 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 else if (!cased && Py_UNICODE_ISLOWER(ch))
11717 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011719 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720}
11721
INADA Naoki3ae20562017-01-16 20:41:20 +090011722/*[clinic input]
11723str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724
INADA Naoki3ae20562017-01-16 20:41:20 +090011725Return True if the string is an uppercase string, False otherwise.
11726
11727A string is uppercase if all cased characters in the string are uppercase and
11728there is at least one cased character in the string.
11729[clinic start generated code]*/
11730
11731static PyObject *
11732unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011733/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735 Py_ssize_t i, length;
11736 int kind;
11737 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738 int cased;
11739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 if (PyUnicode_READY(self) == -1)
11741 return NULL;
11742 length = PyUnicode_GET_LENGTH(self);
11743 kind = PyUnicode_KIND(self);
11744 data = PyUnicode_DATA(self);
11745
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 if (length == 1)
11748 return PyBool_FromLong(
11749 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011751 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011753 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011754
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 for (i = 0; i < length; i++) {
11757 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011758
Benjamin Peterson29060642009-01-31 22:14:21 +000011759 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011760 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 else if (!cased && Py_UNICODE_ISUPPER(ch))
11762 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011764 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765}
11766
INADA Naoki3ae20562017-01-16 20:41:20 +090011767/*[clinic input]
11768str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769
INADA Naoki3ae20562017-01-16 20:41:20 +090011770Return True if the string is a title-cased string, False otherwise.
11771
11772In a title-cased string, upper- and title-case characters may only
11773follow uncased characters and lowercase characters only cased ones.
11774[clinic start generated code]*/
11775
11776static PyObject *
11777unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011778/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 Py_ssize_t i, length;
11781 int kind;
11782 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783 int cased, previous_is_cased;
11784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 if (PyUnicode_READY(self) == -1)
11786 return NULL;
11787 length = PyUnicode_GET_LENGTH(self);
11788 kind = PyUnicode_KIND(self);
11789 data = PyUnicode_DATA(self);
11790
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 if (length == 1) {
11793 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11794 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11795 (Py_UNICODE_ISUPPER(ch) != 0));
11796 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011798 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011800 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011801
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802 cased = 0;
11803 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 for (i = 0; i < length; i++) {
11805 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011806
Benjamin Peterson29060642009-01-31 22:14:21 +000011807 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11808 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011809 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 previous_is_cased = 1;
11811 cased = 1;
11812 }
11813 else if (Py_UNICODE_ISLOWER(ch)) {
11814 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011815 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 previous_is_cased = 1;
11817 cased = 1;
11818 }
11819 else
11820 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011822 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823}
11824
INADA Naoki3ae20562017-01-16 20:41:20 +090011825/*[clinic input]
11826str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827
INADA Naoki3ae20562017-01-16 20:41:20 +090011828Return True if the string is a whitespace string, False otherwise.
11829
11830A string is whitespace if all characters in the string are whitespace and there
11831is at least one character in the string.
11832[clinic start generated code]*/
11833
11834static PyObject *
11835unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011836/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 Py_ssize_t i, length;
11839 int kind;
11840 void *data;
11841
11842 if (PyUnicode_READY(self) == -1)
11843 return NULL;
11844 length = PyUnicode_GET_LENGTH(self);
11845 kind = PyUnicode_KIND(self);
11846 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 if (length == 1)
11850 return PyBool_FromLong(
11851 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011853 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011855 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 for (i = 0; i < length; i++) {
11858 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011859 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011860 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011862 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863}
11864
INADA Naoki3ae20562017-01-16 20:41:20 +090011865/*[clinic input]
11866str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011867
INADA Naoki3ae20562017-01-16 20:41:20 +090011868Return True if the string is an alphabetic string, False otherwise.
11869
11870A string is alphabetic if all characters in the string are alphabetic and there
11871is at least one character in the string.
11872[clinic start generated code]*/
11873
11874static PyObject *
11875unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011876/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 Py_ssize_t i, length;
11879 int kind;
11880 void *data;
11881
11882 if (PyUnicode_READY(self) == -1)
11883 return NULL;
11884 length = PyUnicode_GET_LENGTH(self);
11885 kind = PyUnicode_KIND(self);
11886 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011887
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011888 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 if (length == 1)
11890 return PyBool_FromLong(
11891 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011892
11893 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011895 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 for (i = 0; i < length; i++) {
11898 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011899 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011900 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011901 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011902}
11903
INADA Naoki3ae20562017-01-16 20:41:20 +090011904/*[clinic input]
11905str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011906
INADA Naoki3ae20562017-01-16 20:41:20 +090011907Return True if the string is an alpha-numeric string, False otherwise.
11908
11909A string is alpha-numeric if all characters in the string are alpha-numeric and
11910there is at least one character in the string.
11911[clinic start generated code]*/
11912
11913static PyObject *
11914unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011915/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011916{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 int kind;
11918 void *data;
11919 Py_ssize_t len, i;
11920
11921 if (PyUnicode_READY(self) == -1)
11922 return NULL;
11923
11924 kind = PyUnicode_KIND(self);
11925 data = PyUnicode_DATA(self);
11926 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011927
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011928 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 if (len == 1) {
11930 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11931 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11932 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011933
11934 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011936 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 for (i = 0; i < len; i++) {
11939 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011940 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011941 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011942 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011943 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011944}
11945
INADA Naoki3ae20562017-01-16 20:41:20 +090011946/*[clinic input]
11947str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948
INADA Naoki3ae20562017-01-16 20:41:20 +090011949Return True if the string is a decimal string, False otherwise.
11950
11951A string is a decimal string if all characters in the string are decimal and
11952there is at least one character in the string.
11953[clinic start generated code]*/
11954
11955static PyObject *
11956unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011957/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 Py_ssize_t i, length;
11960 int kind;
11961 void *data;
11962
11963 if (PyUnicode_READY(self) == -1)
11964 return NULL;
11965 length = PyUnicode_GET_LENGTH(self);
11966 kind = PyUnicode_KIND(self);
11967 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (length == 1)
11971 return PyBool_FromLong(
11972 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011974 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011976 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 for (i = 0; i < length; i++) {
11979 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011980 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011982 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983}
11984
INADA Naoki3ae20562017-01-16 20:41:20 +090011985/*[clinic input]
11986str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987
INADA Naoki3ae20562017-01-16 20:41:20 +090011988Return True if the string is a digit string, False otherwise.
11989
11990A string is a digit string if all characters in the string are digits and there
11991is at least one character in the string.
11992[clinic start generated code]*/
11993
11994static PyObject *
11995unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011996/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 Py_ssize_t i, length;
11999 int kind;
12000 void *data;
12001
12002 if (PyUnicode_READY(self) == -1)
12003 return NULL;
12004 length = PyUnicode_GET_LENGTH(self);
12005 kind = PyUnicode_KIND(self);
12006 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 if (length == 1) {
12010 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12011 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012014 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012016 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 for (i = 0; i < length; i++) {
12019 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012020 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012022 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023}
12024
INADA Naoki3ae20562017-01-16 20:41:20 +090012025/*[clinic input]
12026str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027
INADA Naoki3ae20562017-01-16 20:41:20 +090012028Return True if the string is a numeric string, False otherwise.
12029
12030A string is numeric if all characters in the string are numeric and there is at
12031least one character in the string.
12032[clinic start generated code]*/
12033
12034static PyObject *
12035unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012036/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 Py_ssize_t i, length;
12039 int kind;
12040 void *data;
12041
12042 if (PyUnicode_READY(self) == -1)
12043 return NULL;
12044 length = PyUnicode_GET_LENGTH(self);
12045 kind = PyUnicode_KIND(self);
12046 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 if (length == 1)
12050 return PyBool_FromLong(
12051 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012053 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012055 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 for (i = 0; i < length; i++) {
12058 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012059 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012061 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062}
12063
Martin v. Löwis47383402007-08-15 07:32:56 +000012064int
12065PyUnicode_IsIdentifier(PyObject *self)
12066{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 int kind;
12068 void *data;
12069 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012070 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 if (PyUnicode_READY(self) == -1) {
12073 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012074 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 }
12076
12077 /* Special case for empty strings */
12078 if (PyUnicode_GET_LENGTH(self) == 0)
12079 return 0;
12080 kind = PyUnicode_KIND(self);
12081 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012082
12083 /* PEP 3131 says that the first character must be in
12084 XID_Start and subsequent characters in XID_Continue,
12085 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012086 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012087 letters, digits, underscore). However, given the current
12088 definition of XID_Start and XID_Continue, it is sufficient
12089 to check just for these, except that _ must be allowed
12090 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012092 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012093 return 0;
12094
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012095 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012097 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012098 return 1;
12099}
12100
INADA Naoki3ae20562017-01-16 20:41:20 +090012101/*[clinic input]
12102str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012103
INADA Naoki3ae20562017-01-16 20:41:20 +090012104Return True if the string is a valid Python identifier, False otherwise.
12105
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012106Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012107such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012108[clinic start generated code]*/
12109
12110static PyObject *
12111unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012112/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012113{
12114 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12115}
12116
INADA Naoki3ae20562017-01-16 20:41:20 +090012117/*[clinic input]
12118str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012119
INADA Naoki3ae20562017-01-16 20:41:20 +090012120Return True if the string is printable, False otherwise.
12121
12122A string is printable if all of its characters are considered printable in
12123repr() or if it is empty.
12124[clinic start generated code]*/
12125
12126static PyObject *
12127unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012128/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 Py_ssize_t i, length;
12131 int kind;
12132 void *data;
12133
12134 if (PyUnicode_READY(self) == -1)
12135 return NULL;
12136 length = PyUnicode_GET_LENGTH(self);
12137 kind = PyUnicode_KIND(self);
12138 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012139
12140 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 if (length == 1)
12142 return PyBool_FromLong(
12143 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 for (i = 0; i < length; i++) {
12146 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012147 Py_RETURN_FALSE;
12148 }
12149 }
12150 Py_RETURN_TRUE;
12151}
12152
INADA Naoki3ae20562017-01-16 20:41:20 +090012153/*[clinic input]
12154str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155
INADA Naoki3ae20562017-01-16 20:41:20 +090012156 iterable: object
12157 /
12158
12159Concatenate any number of strings.
12160
Martin Panter91a88662017-01-24 00:30:06 +000012161The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012162The result is returned as a new string.
12163
12164Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12165[clinic start generated code]*/
12166
12167static PyObject *
12168unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012169/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170{
INADA Naoki3ae20562017-01-16 20:41:20 +090012171 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172}
12173
Martin v. Löwis18e16552006-02-15 17:27:45 +000012174static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012175unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 if (PyUnicode_READY(self) == -1)
12178 return -1;
12179 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180}
12181
INADA Naoki3ae20562017-01-16 20:41:20 +090012182/*[clinic input]
12183str.ljust as unicode_ljust
12184
12185 width: Py_ssize_t
12186 fillchar: Py_UCS4 = ' '
12187 /
12188
12189Return a left-justified string of length width.
12190
12191Padding is done using the specified fill character (default is a space).
12192[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193
12194static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012195unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12196/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012198 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012199 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200
Victor Stinnerc4b49542011-12-11 22:44:26 +010012201 if (PyUnicode_GET_LENGTH(self) >= width)
12202 return unicode_result_unchanged(self);
12203
12204 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205}
12206
INADA Naoki3ae20562017-01-16 20:41:20 +090012207/*[clinic input]
12208str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209
INADA Naoki3ae20562017-01-16 20:41:20 +090012210Return a copy of the string converted to lowercase.
12211[clinic start generated code]*/
12212
12213static PyObject *
12214unicode_lower_impl(PyObject *self)
12215/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012217 if (PyUnicode_READY(self) == -1)
12218 return NULL;
12219 if (PyUnicode_IS_ASCII(self))
12220 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012221 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222}
12223
/* Strip modes understood by the strip helpers.  The values double as
   indices into stripfuncnames[] below, so keep both in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the strip-mode constants above.  Both the
   pointer slots and the characters are const, so the table cannot be
   modified through this name. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip mode `i`. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012232
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012233/* externally visible for str.strip(unicode) */
12234PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012235_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012236{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 void *data;
12238 int kind;
12239 Py_ssize_t i, j, len;
12240 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012241 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12244 return NULL;
12245
12246 kind = PyUnicode_KIND(self);
12247 data = PyUnicode_DATA(self);
12248 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012249 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12251 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012252 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012253
Benjamin Peterson14339b62009-01-31 16:36:08 +000012254 i = 0;
12255 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012256 while (i < len) {
12257 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12258 if (!BLOOM(sepmask, ch))
12259 break;
12260 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12261 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012262 i++;
12263 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012264 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012265
Benjamin Peterson14339b62009-01-31 16:36:08 +000012266 j = len;
12267 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012268 j--;
12269 while (j >= i) {
12270 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12271 if (!BLOOM(sepmask, ch))
12272 break;
12273 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12274 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012275 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012276 }
12277
Benjamin Peterson29060642009-01-31 22:14:21 +000012278 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012279 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012280
Victor Stinner7931d9a2011-11-04 00:22:48 +010012281 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282}
12283
12284PyObject*
12285PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12286{
12287 unsigned char *data;
12288 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012289 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290
Victor Stinnerde636f32011-10-01 03:55:54 +020012291 if (PyUnicode_READY(self) == -1)
12292 return NULL;
12293
Victor Stinner684d5fd2012-05-03 02:32:34 +020012294 length = PyUnicode_GET_LENGTH(self);
12295 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012296
Victor Stinner684d5fd2012-05-03 02:32:34 +020012297 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012298 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299
Victor Stinnerde636f32011-10-01 03:55:54 +020012300 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012301 PyErr_SetString(PyExc_IndexError, "string index out of range");
12302 return NULL;
12303 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012304 if (start >= length || end < start)
12305 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012306
Victor Stinner684d5fd2012-05-03 02:32:34 +020012307 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012308 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012309 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012310 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012311 }
12312 else {
12313 kind = PyUnicode_KIND(self);
12314 data = PyUnicode_1BYTE_DATA(self);
12315 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012316 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012317 length);
12318 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320
12321static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012322do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 Py_ssize_t len, i, j;
12325
12326 if (PyUnicode_READY(self) == -1)
12327 return NULL;
12328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012330
Victor Stinnercc7af722013-04-09 22:39:24 +020012331 if (PyUnicode_IS_ASCII(self)) {
12332 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12333
12334 i = 0;
12335 if (striptype != RIGHTSTRIP) {
12336 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012337 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012338 if (!_Py_ascii_whitespace[ch])
12339 break;
12340 i++;
12341 }
12342 }
12343
12344 j = len;
12345 if (striptype != LEFTSTRIP) {
12346 j--;
12347 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012348 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012349 if (!_Py_ascii_whitespace[ch])
12350 break;
12351 j--;
12352 }
12353 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012354 }
12355 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012356 else {
12357 int kind = PyUnicode_KIND(self);
12358 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012359
Victor Stinnercc7af722013-04-09 22:39:24 +020012360 i = 0;
12361 if (striptype != RIGHTSTRIP) {
12362 while (i < len) {
12363 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12364 if (!Py_UNICODE_ISSPACE(ch))
12365 break;
12366 i++;
12367 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012368 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012369
12370 j = len;
12371 if (striptype != LEFTSTRIP) {
12372 j--;
12373 while (j >= i) {
12374 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12375 if (!Py_UNICODE_ISSPACE(ch))
12376 break;
12377 j--;
12378 }
12379 j++;
12380 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012381 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012382
Victor Stinner7931d9a2011-11-04 00:22:48 +010012383 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384}
12385
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012386
12387static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012388do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012389{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012390 if (sep != NULL && sep != Py_None) {
12391 if (PyUnicode_Check(sep))
12392 return _PyUnicode_XStrip(self, striptype, sep);
12393 else {
12394 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012395 "%s arg must be None or str",
12396 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012397 return NULL;
12398 }
12399 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012400
Benjamin Peterson14339b62009-01-31 16:36:08 +000012401 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012402}
12403
12404
INADA Naoki3ae20562017-01-16 20:41:20 +090012405/*[clinic input]
12406str.strip as unicode_strip
12407
12408 chars: object = None
12409 /
12410
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012412
12413If chars is given and not None, remove characters in chars instead.
12414[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012415
12416static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012417unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012418/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012419{
INADA Naoki3ae20562017-01-16 20:41:20 +090012420 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012421}
12422
12423
INADA Naoki3ae20562017-01-16 20:41:20 +090012424/*[clinic input]
12425str.lstrip as unicode_lstrip
12426
12427 chars: object = NULL
12428 /
12429
12430Return a copy of the string with leading whitespace removed.
12431
12432If chars is given and not None, remove characters in chars instead.
12433[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012434
12435static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012436unicode_lstrip_impl(PyObject *self, PyObject *chars)
12437/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012438{
INADA Naoki3ae20562017-01-16 20:41:20 +090012439 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012440}
12441
12442
INADA Naoki3ae20562017-01-16 20:41:20 +090012443/*[clinic input]
12444str.rstrip as unicode_rstrip
12445
12446 chars: object = NULL
12447 /
12448
12449Return a copy of the string with trailing whitespace removed.
12450
12451If chars is given and not None, remove characters in chars instead.
12452[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012453
12454static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012455unicode_rstrip_impl(PyObject *self, PyObject *chars)
12456/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012457{
INADA Naoki3ae20562017-01-16 20:41:20 +090012458 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012459}
12460
12461
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012463unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012464{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012465 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467
Serhiy Storchaka05997252013-01-26 12:14:02 +020012468 if (len < 1)
12469 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470
Victor Stinnerc4b49542011-12-11 22:44:26 +010012471 /* no repeat, return original string */
12472 if (len == 1)
12473 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012474
Benjamin Petersonbac79492012-01-14 13:34:47 -050012475 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 return NULL;
12477
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012478 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012479 PyErr_SetString(PyExc_OverflowError,
12480 "repeated string is too long");
12481 return NULL;
12482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012484
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012485 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486 if (!u)
12487 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012488 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 if (PyUnicode_GET_LENGTH(str) == 1) {
12491 const int kind = PyUnicode_KIND(str);
12492 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012493 if (kind == PyUnicode_1BYTE_KIND) {
12494 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012495 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012496 }
12497 else if (kind == PyUnicode_2BYTE_KIND) {
12498 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012499 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012500 ucs2[n] = fill_char;
12501 } else {
12502 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12503 assert(kind == PyUnicode_4BYTE_KIND);
12504 for (n = 0; n < len; ++n)
12505 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 }
12508 else {
12509 /* number of characters copied this far */
12510 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012511 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012513 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012515 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012517 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012518 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520 }
12521
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012522 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012523 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524}
12525
Alexander Belopolsky40018472011-02-26 01:02:56 +000012526PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012527PyUnicode_Replace(PyObject *str,
12528 PyObject *substr,
12529 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012530 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012532 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12533 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012534 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012535 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536}
12537
INADA Naoki3ae20562017-01-16 20:41:20 +090012538/*[clinic input]
12539str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540
INADA Naoki3ae20562017-01-16 20:41:20 +090012541 old: unicode
12542 new: unicode
12543 count: Py_ssize_t = -1
12544 Maximum number of occurrences to replace.
12545 -1 (the default value) means replace all occurrences.
12546 /
12547
12548Return a copy with all occurrences of substring old replaced by new.
12549
12550If the optional argument count is given, only the first count occurrences are
12551replaced.
12552[clinic start generated code]*/
12553
12554static PyObject *
12555unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12556 Py_ssize_t count)
12557/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012559 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012561 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562}
12563
Alexander Belopolsky40018472011-02-26 01:02:56 +000012564static PyObject *
12565unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012567 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012568 Py_ssize_t isize;
12569 Py_ssize_t osize, squote, dquote, i, o;
12570 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012571 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012575 return NULL;
12576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 isize = PyUnicode_GET_LENGTH(unicode);
12578 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 /* Compute length of output, quote characters, and
12581 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012582 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 max = 127;
12584 squote = dquote = 0;
12585 ikind = PyUnicode_KIND(unicode);
12586 for (i = 0; i < isize; i++) {
12587 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012588 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012589 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012590 case '\'': squote++; break;
12591 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012592 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012593 incr = 2;
12594 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595 default:
12596 /* Fast-path ASCII */
12597 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012598 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012600 ;
12601 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012604 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012606 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012607 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012608 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012610 if (osize > PY_SSIZE_T_MAX - incr) {
12611 PyErr_SetString(PyExc_OverflowError,
12612 "string is too long to generate repr");
12613 return NULL;
12614 }
12615 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 }
12617
12618 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012619 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012621 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 if (dquote)
12623 /* Both squote and dquote present. Use squote,
12624 and escape them */
12625 osize += squote;
12626 else
12627 quote = '"';
12628 }
Victor Stinner55c08782013-04-14 18:45:39 +020012629 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630
12631 repr = PyUnicode_New(osize, max);
12632 if (repr == NULL)
12633 return NULL;
12634 okind = PyUnicode_KIND(repr);
12635 odata = PyUnicode_DATA(repr);
12636
12637 PyUnicode_WRITE(okind, odata, 0, quote);
12638 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012639 if (unchanged) {
12640 _PyUnicode_FastCopyCharacters(repr, 1,
12641 unicode, 0,
12642 isize);
12643 }
12644 else {
12645 for (i = 0, o = 1; i < isize; i++) {
12646 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647
Victor Stinner55c08782013-04-14 18:45:39 +020012648 /* Escape quotes and backslashes */
12649 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012650 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012652 continue;
12653 }
12654
12655 /* Map special whitespace to '\t', \n', '\r' */
12656 if (ch == '\t') {
12657 PyUnicode_WRITE(okind, odata, o++, '\\');
12658 PyUnicode_WRITE(okind, odata, o++, 't');
12659 }
12660 else if (ch == '\n') {
12661 PyUnicode_WRITE(okind, odata, o++, '\\');
12662 PyUnicode_WRITE(okind, odata, o++, 'n');
12663 }
12664 else if (ch == '\r') {
12665 PyUnicode_WRITE(okind, odata, o++, '\\');
12666 PyUnicode_WRITE(okind, odata, o++, 'r');
12667 }
12668
12669 /* Map non-printable US ASCII to '\xhh' */
12670 else if (ch < ' ' || ch == 0x7F) {
12671 PyUnicode_WRITE(okind, odata, o++, '\\');
12672 PyUnicode_WRITE(okind, odata, o++, 'x');
12673 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12674 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12675 }
12676
12677 /* Copy ASCII characters as-is */
12678 else if (ch < 0x7F) {
12679 PyUnicode_WRITE(okind, odata, o++, ch);
12680 }
12681
12682 /* Non-ASCII characters */
12683 else {
12684 /* Map Unicode whitespace and control characters
12685 (categories Z* and C* except ASCII space)
12686 */
12687 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12688 PyUnicode_WRITE(okind, odata, o++, '\\');
12689 /* Map 8-bit characters to '\xhh' */
12690 if (ch <= 0xff) {
12691 PyUnicode_WRITE(okind, odata, o++, 'x');
12692 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12693 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12694 }
12695 /* Map 16-bit characters to '\uxxxx' */
12696 else if (ch <= 0xffff) {
12697 PyUnicode_WRITE(okind, odata, o++, 'u');
12698 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12699 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12700 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12701 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12702 }
12703 /* Map 21-bit characters to '\U00xxxxxx' */
12704 else {
12705 PyUnicode_WRITE(okind, odata, o++, 'U');
12706 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12707 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12708 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12709 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12710 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12711 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12712 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12713 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12714 }
12715 }
12716 /* Copy characters as-is */
12717 else {
12718 PyUnicode_WRITE(okind, odata, o++, ch);
12719 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012720 }
12721 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012722 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012724 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012725 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726}
12727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012728PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012729 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730\n\
12731Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012732such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733arguments start and end are interpreted as in slice notation.\n\
12734\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012735Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012736
12737static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012740 /* initialize variables to prevent gcc warning */
12741 PyObject *substring = NULL;
12742 Py_ssize_t start = 0;
12743 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012744 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012746 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012747 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012748
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012749 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012752 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 if (result == -2)
12755 return NULL;
12756
Christian Heimes217cfd12007-12-02 14:31:20 +000012757 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012758}
12759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012760PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012761 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012763Return the highest index in S where substring sub is found,\n\
12764such that sub is contained within S[start:end]. Optional\n\
12765arguments start and end are interpreted as in slice notation.\n\
12766\n\
12767Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768
12769static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012770unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012771{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012772 /* initialize variables to prevent gcc warning */
12773 PyObject *substring = NULL;
12774 Py_ssize_t start = 0;
12775 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012776 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012777
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012778 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012779 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012781 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012784 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786 if (result == -2)
12787 return NULL;
12788
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789 if (result < 0) {
12790 PyErr_SetString(PyExc_ValueError, "substring not found");
12791 return NULL;
12792 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793
Christian Heimes217cfd12007-12-02 14:31:20 +000012794 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012795}
12796
INADA Naoki3ae20562017-01-16 20:41:20 +090012797/*[clinic input]
12798str.rjust as unicode_rjust
12799
12800 width: Py_ssize_t
12801 fillchar: Py_UCS4 = ' '
12802 /
12803
12804Return a right-justified string of length width.
12805
12806Padding is done using the specified fill character (default is a space).
12807[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808
12809static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012810unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12811/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012813 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814 return NULL;
12815
Victor Stinnerc4b49542011-12-11 22:44:26 +010012816 if (PyUnicode_GET_LENGTH(self) >= width)
12817 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818
Victor Stinnerc4b49542011-12-11 22:44:26 +010012819 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820}
12821
Alexander Belopolsky40018472011-02-26 01:02:56 +000012822PyObject *
12823PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012824{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012825 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012826 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012827
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012828 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829}
12830
INADA Naoki3ae20562017-01-16 20:41:20 +090012831/*[clinic input]
12832str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833
INADA Naoki3ae20562017-01-16 20:41:20 +090012834 sep: object = None
        The delimiter according to which to split the string.
12836 None (the default value) means split according to any whitespace,
12837 and discard empty strings from the result.
12838 maxsplit: Py_ssize_t = -1
12839 Maximum number of splits to do.
12840 -1 (the default value) means no limit.
12841
12842Return a list of the words in the string, using sep as the delimiter string.
12843[clinic start generated code]*/
12844
12845static PyObject *
12846unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12847/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848{
INADA Naoki3ae20562017-01-16 20:41:20 +090012849 if (sep == Py_None)
12850 return split(self, NULL, maxsplit);
12851 if (PyUnicode_Check(sep))
12852 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012853
Victor Stinner998b8062018-09-12 00:23:25 +020012854 PyErr_Format(PyExc_TypeError,
12855 "must be str or None, not %.100s",
12856 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858}
12859
Thomas Wouters477c8d52006-05-27 19:21:47 +000012860PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012861PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012862{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012863 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012864 int kind1, kind2;
12865 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012866 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012867
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012868 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012869 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012870
Victor Stinner14f8f022011-10-05 20:58:25 +020012871 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873 len1 = PyUnicode_GET_LENGTH(str_obj);
12874 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012875 if (kind1 < kind2 || len1 < len2) {
12876 _Py_INCREF_UNICODE_EMPTY();
12877 if (!unicode_empty)
12878 out = NULL;
12879 else {
12880 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12881 Py_DECREF(unicode_empty);
12882 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012883 return out;
12884 }
12885 buf1 = PyUnicode_DATA(str_obj);
12886 buf2 = PyUnicode_DATA(sep_obj);
12887 if (kind2 != kind1) {
12888 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12889 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012890 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012893 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012894 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012895 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12896 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12897 else
12898 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899 break;
12900 case PyUnicode_2BYTE_KIND:
12901 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12902 break;
12903 case PyUnicode_4BYTE_KIND:
12904 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12905 break;
12906 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012907 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012908 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012909
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012910 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012912
12913 return out;
12914}
12915
12916
12917PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012918PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012919{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012920 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012921 int kind1, kind2;
12922 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012924
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012925 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012926 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012927
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012928 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 len1 = PyUnicode_GET_LENGTH(str_obj);
12931 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012932 if (kind1 < kind2 || len1 < len2) {
12933 _Py_INCREF_UNICODE_EMPTY();
12934 if (!unicode_empty)
12935 out = NULL;
12936 else {
12937 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12938 Py_DECREF(unicode_empty);
12939 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012940 return out;
12941 }
12942 buf1 = PyUnicode_DATA(str_obj);
12943 buf2 = PyUnicode_DATA(sep_obj);
12944 if (kind2 != kind1) {
12945 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12946 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012947 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012950 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012952 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12953 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12954 else
12955 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 break;
12957 case PyUnicode_2BYTE_KIND:
12958 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12959 break;
12960 case PyUnicode_4BYTE_KIND:
12961 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12962 break;
12963 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012964 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012966
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012967 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012969
12970 return out;
12971}
12972
INADA Naoki3ae20562017-01-16 20:41:20 +090012973/*[clinic input]
12974str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012975
INADA Naoki3ae20562017-01-16 20:41:20 +090012976 sep: object
12977 /
12978
12979Partition the string into three parts using the given separator.
12980
12981This will search for the separator in the string. If the separator is found,
12982returns a 3-tuple containing the part before the separator, the separator
12983itself, and the part after it.
12984
12985If the separator is not found, returns a 3-tuple containing the original string
12986and two empty strings.
12987[clinic start generated code]*/
12988
12989static PyObject *
12990unicode_partition(PyObject *self, PyObject *sep)
12991/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012992{
INADA Naoki3ae20562017-01-16 20:41:20 +090012993 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012994}
12995
INADA Naoki3ae20562017-01-16 20:41:20 +090012996/*[clinic input]
12997str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012998
INADA Naoki3ae20562017-01-16 20:41:20 +090012999Partition the string into three parts using the given separator.
13000
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013001This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013002the separator is found, returns a 3-tuple containing the part before the
13003separator, the separator itself, and the part after it.
13004
13005If the separator is not found, returns a 3-tuple containing two empty strings
13006and the original string.
13007[clinic start generated code]*/
13008
13009static PyObject *
13010unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013011/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013012{
INADA Naoki3ae20562017-01-16 20:41:20 +090013013 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013014}
13015
Alexander Belopolsky40018472011-02-26 01:02:56 +000013016PyObject *
13017PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013018{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013019 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013020 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013021
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013022 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013023}
13024
INADA Naoki3ae20562017-01-16 20:41:20 +090013025/*[clinic input]
13026str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013027
INADA Naoki3ae20562017-01-16 20:41:20 +090013028Return a list of the words in the string, using sep as the delimiter string.
13029
13030Splits are done starting at the end of the string and working to the front.
13031[clinic start generated code]*/
13032
13033static PyObject *
13034unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13035/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013036{
INADA Naoki3ae20562017-01-16 20:41:20 +090013037 if (sep == Py_None)
13038 return rsplit(self, NULL, maxsplit);
13039 if (PyUnicode_Check(sep))
13040 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013041
Victor Stinner998b8062018-09-12 00:23:25 +020013042 PyErr_Format(PyExc_TypeError,
13043 "must be str or None, not %.100s",
13044 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013045 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013046}
13047
INADA Naoki3ae20562017-01-16 20:41:20 +090013048/*[clinic input]
13049str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013050
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013051 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013052
13053Return a list of the lines in the string, breaking at line boundaries.
13054
13055Line breaks are not included in the resulting list unless keepends is given and
13056true.
13057[clinic start generated code]*/
13058
13059static PyObject *
13060unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013061/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013063 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013064}
13065
13066static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013067PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013068{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013069 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013070}
13071
INADA Naoki3ae20562017-01-16 20:41:20 +090013072/*[clinic input]
13073str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074
INADA Naoki3ae20562017-01-16 20:41:20 +090013075Convert uppercase characters to lowercase and lowercase characters to uppercase.
13076[clinic start generated code]*/
13077
13078static PyObject *
13079unicode_swapcase_impl(PyObject *self)
13080/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013082 if (PyUnicode_READY(self) == -1)
13083 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013084 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085}
13086
Larry Hastings61272b72014-01-07 12:41:53 -080013087/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013088
Larry Hastings31826802013-10-19 00:09:25 -070013089@staticmethod
13090str.maketrans as unicode_maketrans
13091
13092 x: object
13093
13094 y: unicode=NULL
13095
13096 z: unicode=NULL
13097
13098 /
13099
13100Return a translation table usable for str.translate().
13101
13102If there is only one argument, it must be a dictionary mapping Unicode
13103ordinals (integers) or characters to Unicode ordinals, strings or None.
13104Character keys will be then converted to ordinals.
13105If there are two arguments, they must be strings of equal length, and
13106in the resulting dictionary, each character in x will be mapped to the
13107character at the same position in y. If there is a third argument, it
13108must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013109[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013110
Larry Hastings31826802013-10-19 00:09:25 -070013111static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013112unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013113/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013114{
Georg Brandlceee0772007-11-27 23:48:05 +000013115 PyObject *new = NULL, *key, *value;
13116 Py_ssize_t i = 0;
13117 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013118
Georg Brandlceee0772007-11-27 23:48:05 +000013119 new = PyDict_New();
13120 if (!new)
13121 return NULL;
13122 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 int x_kind, y_kind, z_kind;
13124 void *x_data, *y_data, *z_data;
13125
Georg Brandlceee0772007-11-27 23:48:05 +000013126 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013127 if (!PyUnicode_Check(x)) {
13128 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13129 "be a string if there is a second argument");
13130 goto err;
13131 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013133 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13134 "arguments must have equal length");
13135 goto err;
13136 }
13137 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 x_kind = PyUnicode_KIND(x);
13139 y_kind = PyUnicode_KIND(y);
13140 x_data = PyUnicode_DATA(x);
13141 y_data = PyUnicode_DATA(y);
13142 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13143 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013144 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013145 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013146 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013147 if (!value) {
13148 Py_DECREF(key);
13149 goto err;
13150 }
Georg Brandlceee0772007-11-27 23:48:05 +000013151 res = PyDict_SetItem(new, key, value);
13152 Py_DECREF(key);
13153 Py_DECREF(value);
13154 if (res < 0)
13155 goto err;
13156 }
13157 /* create entries for deleting chars in z */
13158 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013159 z_kind = PyUnicode_KIND(z);
13160 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013161 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013162 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013163 if (!key)
13164 goto err;
13165 res = PyDict_SetItem(new, key, Py_None);
13166 Py_DECREF(key);
13167 if (res < 0)
13168 goto err;
13169 }
13170 }
13171 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013172 int kind;
13173 void *data;
13174
Georg Brandlceee0772007-11-27 23:48:05 +000013175 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013176 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013177 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13178 "to maketrans it must be a dict");
13179 goto err;
13180 }
13181 /* copy entries into the new dict, converting string keys to int keys */
13182 while (PyDict_Next(x, &i, &key, &value)) {
13183 if (PyUnicode_Check(key)) {
13184 /* convert string keys to integer keys */
13185 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013186 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013187 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13188 "table must be of length 1");
13189 goto err;
13190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191 kind = PyUnicode_KIND(key);
13192 data = PyUnicode_DATA(key);
13193 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013194 if (!newkey)
13195 goto err;
13196 res = PyDict_SetItem(new, newkey, value);
13197 Py_DECREF(newkey);
13198 if (res < 0)
13199 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013200 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013201 /* just keep integer keys */
13202 if (PyDict_SetItem(new, key, value) < 0)
13203 goto err;
13204 } else {
13205 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13206 "be strings or integers");
13207 goto err;
13208 }
13209 }
13210 }
13211 return new;
13212 err:
13213 Py_DECREF(new);
13214 return NULL;
13215}
13216
INADA Naoki3ae20562017-01-16 20:41:20 +090013217/*[clinic input]
13218str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219
INADA Naoki3ae20562017-01-16 20:41:20 +090013220 table: object
13221 Translation table, which must be a mapping of Unicode ordinals to
13222 Unicode ordinals, strings, or None.
13223 /
13224
13225Replace each character in the string using the given translation table.
13226
13227The table must implement lookup/indexing via __getitem__, for instance a
13228dictionary or list. If this operation raises LookupError, the character is
13229left untouched. Characters mapped to None are deleted.
13230[clinic start generated code]*/
13231
13232static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013234/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013237}
13238
INADA Naoki3ae20562017-01-16 20:41:20 +090013239/*[clinic input]
13240str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241
INADA Naoki3ae20562017-01-16 20:41:20 +090013242Return a copy of the string converted to uppercase.
13243[clinic start generated code]*/
13244
13245static PyObject *
13246unicode_upper_impl(PyObject *self)
13247/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013248{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013249 if (PyUnicode_READY(self) == -1)
13250 return NULL;
13251 if (PyUnicode_IS_ASCII(self))
13252 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013253 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013254}
13255
INADA Naoki3ae20562017-01-16 20:41:20 +090013256/*[clinic input]
13257str.zfill as unicode_zfill
13258
13259 width: Py_ssize_t
13260 /
13261
13262Pad a numeric string with zeros on the left, to fill a field of the given width.
13263
13264The string is never truncated.
13265[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013266
13267static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013268unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013269/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013271 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013272 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013273 int kind;
13274 void *data;
13275 Py_UCS4 chr;
13276
Benjamin Petersonbac79492012-01-14 13:34:47 -050013277 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013279
Victor Stinnerc4b49542011-12-11 22:44:26 +010013280 if (PyUnicode_GET_LENGTH(self) >= width)
13281 return unicode_result_unchanged(self);
13282
13283 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284
13285 u = pad(self, fill, 0, '0');
13286
Walter Dörwald068325e2002-04-15 13:36:47 +000013287 if (u == NULL)
13288 return NULL;
13289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013290 kind = PyUnicode_KIND(u);
13291 data = PyUnicode_DATA(u);
13292 chr = PyUnicode_READ(kind, data, fill);
13293
13294 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013296 PyUnicode_WRITE(kind, data, 0, chr);
13297 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298 }
13299
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013300 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013301 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013302}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303
#if 0
/* Compiled out.  Forwards to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13311
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013312PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013313 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013314\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013315Return True if S starts with the specified prefix, False otherwise.\n\
13316With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013317With optional end, stop comparing S at that position.\n\
13318prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013319
13320static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013321unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013322 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013323{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013324 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013325 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013326 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013327 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013328 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013329
Jesus Ceaac451502011-04-20 17:09:23 +020013330 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013331 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013332 if (PyTuple_Check(subobj)) {
13333 Py_ssize_t i;
13334 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013335 substring = PyTuple_GET_ITEM(subobj, i);
13336 if (!PyUnicode_Check(substring)) {
13337 PyErr_Format(PyExc_TypeError,
13338 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013339 "not %.100s",
13340 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013341 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013342 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013343 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013344 if (result == -1)
13345 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013346 if (result) {
13347 Py_RETURN_TRUE;
13348 }
13349 }
13350 /* nothing matched */
13351 Py_RETURN_FALSE;
13352 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013353 if (!PyUnicode_Check(subobj)) {
13354 PyErr_Format(PyExc_TypeError,
13355 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013356 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013357 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013358 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013359 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013360 if (result == -1)
13361 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013362 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013363}
13364
13365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013366PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013367 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013368\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013369Return True if S ends with the specified suffix, False otherwise.\n\
13370With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013371With optional end, stop comparing S at that position.\n\
13372suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013373
13374static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013375unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013376 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013378 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013379 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013380 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013381 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013382 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383
Jesus Ceaac451502011-04-20 17:09:23 +020013384 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013385 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013386 if (PyTuple_Check(subobj)) {
13387 Py_ssize_t i;
13388 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013389 substring = PyTuple_GET_ITEM(subobj, i);
13390 if (!PyUnicode_Check(substring)) {
13391 PyErr_Format(PyExc_TypeError,
13392 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013393 "not %.100s",
13394 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013396 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013397 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013398 if (result == -1)
13399 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013400 if (result) {
13401 Py_RETURN_TRUE;
13402 }
13403 }
13404 Py_RETURN_FALSE;
13405 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013406 if (!PyUnicode_Check(subobj)) {
13407 PyErr_Format(PyExc_TypeError,
13408 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013409 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013410 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013411 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013412 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013413 if (result == -1)
13414 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013415 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013416}
13417
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013418static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013419_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013420{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013421 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13422 writer->data = PyUnicode_DATA(writer->buffer);
13423
13424 if (!writer->readonly) {
13425 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013426 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013427 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013428 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013429 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13430 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13431 writer->kind = PyUnicode_WCHAR_KIND;
13432 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13433
Victor Stinner8f674cc2013-04-17 23:02:17 +020013434 /* Copy-on-write mode: set buffer size to 0 so
13435 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13436 * next write. */
13437 writer->size = 0;
13438 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013439}
13440
Victor Stinnerd3f08822012-05-29 12:57:52 +020013441void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013442_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013443{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013444 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013445
13446 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013447 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013448
13449 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13450 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13451 writer->kind = PyUnicode_WCHAR_KIND;
13452 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013453}
13454
Victor Stinnerd3f08822012-05-29 12:57:52 +020013455int
13456_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13457 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013458{
13459 Py_ssize_t newlen;
13460 PyObject *newbuffer;
13461
Victor Stinner2740e462016-09-06 16:58:36 -070013462 assert(maxchar <= MAX_UNICODE);
13463
Victor Stinnerca9381e2015-09-22 00:58:32 +020013464 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013465 assert((maxchar > writer->maxchar && length >= 0)
13466 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013467
Victor Stinner202fdca2012-05-07 12:47:02 +020013468 if (length > PY_SSIZE_T_MAX - writer->pos) {
13469 PyErr_NoMemory();
13470 return -1;
13471 }
13472 newlen = writer->pos + length;
13473
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013474 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013475
Victor Stinnerd3f08822012-05-29 12:57:52 +020013476 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013477 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013478 if (writer->overallocate
13479 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13480 /* overallocate to limit the number of realloc() */
13481 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013482 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013483 if (newlen < writer->min_length)
13484 newlen = writer->min_length;
13485
Victor Stinnerd3f08822012-05-29 12:57:52 +020013486 writer->buffer = PyUnicode_New(newlen, maxchar);
13487 if (writer->buffer == NULL)
13488 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013489 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013490 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013491 if (writer->overallocate
13492 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13493 /* overallocate to limit the number of realloc() */
13494 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013495 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013496 if (newlen < writer->min_length)
13497 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013498
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013499 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013500 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013501 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013502 newbuffer = PyUnicode_New(newlen, maxchar);
13503 if (newbuffer == NULL)
13504 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013505 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13506 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013507 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013508 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013509 }
13510 else {
13511 newbuffer = resize_compact(writer->buffer, newlen);
13512 if (newbuffer == NULL)
13513 return -1;
13514 }
13515 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013516 }
13517 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013518 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013519 newbuffer = PyUnicode_New(writer->size, maxchar);
13520 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013521 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013522 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13523 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013524 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013525 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013526 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013527 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013528
13529#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013530}
13531
Victor Stinnerca9381e2015-09-22 00:58:32 +020013532int
13533_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13534 enum PyUnicode_Kind kind)
13535{
13536 Py_UCS4 maxchar;
13537
13538 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13539 assert(writer->kind < kind);
13540
13541 switch (kind)
13542 {
13543 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13544 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13545 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13546 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013547 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013548 }
13549
13550 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13551}
13552
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013553static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013554_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013555{
Victor Stinner2740e462016-09-06 16:58:36 -070013556 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013557 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13558 return -1;
13559 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13560 writer->pos++;
13561 return 0;
13562}
13563
13564int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013565_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13566{
13567 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13568}
13569
13570int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013571_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13572{
13573 Py_UCS4 maxchar;
13574 Py_ssize_t len;
13575
13576 if (PyUnicode_READY(str) == -1)
13577 return -1;
13578 len = PyUnicode_GET_LENGTH(str);
13579 if (len == 0)
13580 return 0;
13581 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13582 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013583 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013584 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013585 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013586 Py_INCREF(str);
13587 writer->buffer = str;
13588 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013589 writer->pos += len;
13590 return 0;
13591 }
13592 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13593 return -1;
13594 }
13595 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13596 str, 0, len);
13597 writer->pos += len;
13598 return 0;
13599}
13600
Victor Stinnere215d962012-10-06 23:03:36 +020013601int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013602_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13603 Py_ssize_t start, Py_ssize_t end)
13604{
13605 Py_UCS4 maxchar;
13606 Py_ssize_t len;
13607
13608 if (PyUnicode_READY(str) == -1)
13609 return -1;
13610
13611 assert(0 <= start);
13612 assert(end <= PyUnicode_GET_LENGTH(str));
13613 assert(start <= end);
13614
13615 if (end == 0)
13616 return 0;
13617
13618 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13619 return _PyUnicodeWriter_WriteStr(writer, str);
13620
13621 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13622 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13623 else
13624 maxchar = writer->maxchar;
13625 len = end - start;
13626
13627 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13628 return -1;
13629
13630 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13631 str, start, len);
13632 writer->pos += len;
13633 return 0;
13634}
13635
13636int
Victor Stinner4a587072013-11-19 12:54:53 +010013637_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13638 const char *ascii, Py_ssize_t len)
13639{
13640 if (len == -1)
13641 len = strlen(ascii);
13642
13643 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13644
13645 if (writer->buffer == NULL && !writer->overallocate) {
13646 PyObject *str;
13647
13648 str = _PyUnicode_FromASCII(ascii, len);
13649 if (str == NULL)
13650 return -1;
13651
13652 writer->readonly = 1;
13653 writer->buffer = str;
13654 _PyUnicodeWriter_Update(writer);
13655 writer->pos += len;
13656 return 0;
13657 }
13658
13659 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13660 return -1;
13661
13662 switch (writer->kind)
13663 {
13664 case PyUnicode_1BYTE_KIND:
13665 {
13666 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13667 Py_UCS1 *data = writer->data;
13668
Christian Heimesf051e432016-09-13 20:22:02 +020013669 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013670 break;
13671 }
13672 case PyUnicode_2BYTE_KIND:
13673 {
13674 _PyUnicode_CONVERT_BYTES(
13675 Py_UCS1, Py_UCS2,
13676 ascii, ascii + len,
13677 (Py_UCS2 *)writer->data + writer->pos);
13678 break;
13679 }
13680 case PyUnicode_4BYTE_KIND:
13681 {
13682 _PyUnicode_CONVERT_BYTES(
13683 Py_UCS1, Py_UCS4,
13684 ascii, ascii + len,
13685 (Py_UCS4 *)writer->data + writer->pos);
13686 break;
13687 }
13688 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013689 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013690 }
13691
13692 writer->pos += len;
13693 return 0;
13694}
13695
13696int
13697_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13698 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013699{
13700 Py_UCS4 maxchar;
13701
13702 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13703 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13704 return -1;
13705 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13706 writer->pos += len;
13707 return 0;
13708}
13709
Victor Stinnerd3f08822012-05-29 12:57:52 +020013710PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013711_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013712{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013713 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013714
Victor Stinnerd3f08822012-05-29 12:57:52 +020013715 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013716 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013717 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013718 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013719
13720 str = writer->buffer;
13721 writer->buffer = NULL;
13722
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013723 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013724 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13725 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013726 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013727
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013728 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13729 PyObject *str2;
13730 str2 = resize_compact(str, writer->pos);
13731 if (str2 == NULL) {
13732 Py_DECREF(str);
13733 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013734 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013735 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013736 }
13737
Victor Stinner15a0bd32013-07-08 22:29:55 +020013738 assert(_PyUnicode_CheckConsistency(str, 1));
13739 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013740}
13741
Victor Stinnerd3f08822012-05-29 12:57:52 +020013742void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013743_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013744{
13745 Py_CLEAR(writer->buffer);
13746}
13747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013748#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013749
13750PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013751 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013752\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013753Return a formatted version of S, using substitutions from args and kwargs.\n\
13754The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013755
Eric Smith27bbca62010-11-04 17:06:58 +000013756PyDoc_STRVAR(format_map__doc__,
13757 "S.format_map(mapping) -> str\n\
13758\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013759Return a formatted version of S, using substitutions from mapping.\n\
13760The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013761
INADA Naoki3ae20562017-01-16 20:41:20 +090013762/*[clinic input]
13763str.__format__ as unicode___format__
13764
13765 format_spec: unicode
13766 /
13767
13768Return a formatted version of the string as described by format_spec.
13769[clinic start generated code]*/
13770
Eric Smith4a7d76d2008-05-30 18:10:19 +000013771static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013772unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013773/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013774{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013775 _PyUnicodeWriter writer;
13776 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013777
Victor Stinnerd3f08822012-05-29 12:57:52 +020013778 if (PyUnicode_READY(self) == -1)
13779 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013780 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013781 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13782 self, format_spec, 0,
13783 PyUnicode_GET_LENGTH(format_spec));
13784 if (ret == -1) {
13785 _PyUnicodeWriter_Dealloc(&writer);
13786 return NULL;
13787 }
13788 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013789}
13790
INADA Naoki3ae20562017-01-16 20:41:20 +090013791/*[clinic input]
13792str.__sizeof__ as unicode_sizeof
13793
13794Return the size of the string in memory, in bytes.
13795[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013796
13797static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013798unicode_sizeof_impl(PyObject *self)
13799/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013801 Py_ssize_t size;
13802
13803 /* If it's a compact object, account for base structure +
13804 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013805 if (PyUnicode_IS_COMPACT_ASCII(self))
13806 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13807 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013808 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013809 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013810 else {
13811 /* If it is a two-block object, account for base object, and
13812 for character block if present. */
13813 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013814 if (_PyUnicode_DATA_ANY(self))
13815 size += (PyUnicode_GET_LENGTH(self) + 1) *
13816 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013817 }
13818 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013819 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013820 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13821 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13822 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13823 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013824
13825 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013826}
13827
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013828static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013829unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013830{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013831 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013832 if (!copy)
13833 return NULL;
13834 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013835}
13836
Guido van Rossumd57fd912000-03-10 22:53:23 +000013837static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013838 UNICODE_ENCODE_METHODDEF
13839 UNICODE_REPLACE_METHODDEF
13840 UNICODE_SPLIT_METHODDEF
13841 UNICODE_RSPLIT_METHODDEF
13842 UNICODE_JOIN_METHODDEF
13843 UNICODE_CAPITALIZE_METHODDEF
13844 UNICODE_CASEFOLD_METHODDEF
13845 UNICODE_TITLE_METHODDEF
13846 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013847 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013848 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013849 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013850 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013851 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013852 UNICODE_LJUST_METHODDEF
13853 UNICODE_LOWER_METHODDEF
13854 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013855 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13856 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013857 UNICODE_RJUST_METHODDEF
13858 UNICODE_RSTRIP_METHODDEF
13859 UNICODE_RPARTITION_METHODDEF
13860 UNICODE_SPLITLINES_METHODDEF
13861 UNICODE_STRIP_METHODDEF
13862 UNICODE_SWAPCASE_METHODDEF
13863 UNICODE_TRANSLATE_METHODDEF
13864 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013865 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13866 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013867 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013868 UNICODE_ISLOWER_METHODDEF
13869 UNICODE_ISUPPER_METHODDEF
13870 UNICODE_ISTITLE_METHODDEF
13871 UNICODE_ISSPACE_METHODDEF
13872 UNICODE_ISDECIMAL_METHODDEF
13873 UNICODE_ISDIGIT_METHODDEF
13874 UNICODE_ISNUMERIC_METHODDEF
13875 UNICODE_ISALPHA_METHODDEF
13876 UNICODE_ISALNUM_METHODDEF
13877 UNICODE_ISIDENTIFIER_METHODDEF
13878 UNICODE_ISPRINTABLE_METHODDEF
13879 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013880 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013881 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013882 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013883 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013884 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013885#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013886 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013887 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013888#endif
13889
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013890 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013891 {NULL, NULL}
13892};
13893
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013894static PyObject *
13895unicode_mod(PyObject *v, PyObject *w)
13896{
Brian Curtindfc80e32011-08-10 20:28:54 -050013897 if (!PyUnicode_Check(v))
13898 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013899 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013900}
13901
13902static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013903 0, /*nb_add*/
13904 0, /*nb_subtract*/
13905 0, /*nb_multiply*/
13906 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013907};
13908
Guido van Rossumd57fd912000-03-10 22:53:23 +000013909static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013910 (lenfunc) unicode_length, /* sq_length */
13911 PyUnicode_Concat, /* sq_concat */
13912 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13913 (ssizeargfunc) unicode_getitem, /* sq_item */
13914 0, /* sq_slice */
13915 0, /* sq_ass_item */
13916 0, /* sq_ass_slice */
13917 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013918};
13919
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013920static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013921unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013922{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013923 if (PyUnicode_READY(self) == -1)
13924 return NULL;
13925
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013926 if (PyIndex_Check(item)) {
13927 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013928 if (i == -1 && PyErr_Occurred())
13929 return NULL;
13930 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013931 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013932 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013933 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013934 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013935 PyObject *result;
13936 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013937 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013938 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013939
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013940 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013941 return NULL;
13942 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013943 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13944 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013945
13946 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013947 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013948 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013949 slicelength == PyUnicode_GET_LENGTH(self)) {
13950 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013951 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013952 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013953 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013954 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013955 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013956 src_kind = PyUnicode_KIND(self);
13957 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013958 if (!PyUnicode_IS_ASCII(self)) {
13959 kind_limit = kind_maxchar_limit(src_kind);
13960 max_char = 0;
13961 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13962 ch = PyUnicode_READ(src_kind, src_data, cur);
13963 if (ch > max_char) {
13964 max_char = ch;
13965 if (max_char >= kind_limit)
13966 break;
13967 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013968 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013969 }
Victor Stinner55c99112011-10-13 01:17:06 +020013970 else
13971 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013972 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013973 if (result == NULL)
13974 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013975 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013976 dest_data = PyUnicode_DATA(result);
13977
13978 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013979 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13980 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013981 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013982 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013983 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013984 } else {
13985 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13986 return NULL;
13987 }
13988}
13989
13990static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013991 (lenfunc)unicode_length, /* mp_length */
13992 (binaryfunc)unicode_subscript, /* mp_subscript */
13993 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013994};
13995
Guido van Rossumd57fd912000-03-10 22:53:23 +000013996
Guido van Rossumd57fd912000-03-10 22:53:23 +000013997/* Helpers for PyUnicode_Format() */
13998
Victor Stinnera47082312012-10-04 02:19:54 +020013999struct unicode_formatter_t {
14000 PyObject *args;
14001 int args_owned;
14002 Py_ssize_t arglen, argidx;
14003 PyObject *dict;
14004
14005 enum PyUnicode_Kind fmtkind;
14006 Py_ssize_t fmtcnt, fmtpos;
14007 void *fmtdata;
14008 PyObject *fmtstr;
14009
14010 _PyUnicodeWriter writer;
14011};
14012
14013struct unicode_format_arg_t {
14014 Py_UCS4 ch;
14015 int flags;
14016 Py_ssize_t width;
14017 int prec;
14018 int sign;
14019};
14020
Guido van Rossumd57fd912000-03-10 22:53:23 +000014021static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014022unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014023{
Victor Stinnera47082312012-10-04 02:19:54 +020014024 Py_ssize_t argidx = ctx->argidx;
14025
14026 if (argidx < ctx->arglen) {
14027 ctx->argidx++;
14028 if (ctx->arglen < 0)
14029 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014030 else
Victor Stinnera47082312012-10-04 02:19:54 +020014031 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014032 }
14033 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014034 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014035 return NULL;
14036}
14037
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014038/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014039
Victor Stinnera47082312012-10-04 02:19:54 +020014040/* Format a float into the writer if the writer is not NULL, or into *p_output
14041 otherwise.
14042
14043 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014044static int
Victor Stinnera47082312012-10-04 02:19:54 +020014045formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14046 PyObject **p_output,
14047 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014048{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014049 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014050 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014051 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014052 int prec;
14053 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014054
Guido van Rossumd57fd912000-03-10 22:53:23 +000014055 x = PyFloat_AsDouble(v);
14056 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014057 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014058
Victor Stinnera47082312012-10-04 02:19:54 +020014059 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014060 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014061 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014062
Victor Stinnera47082312012-10-04 02:19:54 +020014063 if (arg->flags & F_ALT)
14064 dtoa_flags = Py_DTSF_ALT;
14065 else
14066 dtoa_flags = 0;
14067 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014068 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014069 return -1;
14070 len = strlen(p);
14071 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014072 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014073 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014074 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014075 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014076 }
14077 else
14078 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014079 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014080 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014081}
14082
Victor Stinnerd0880d52012-04-27 23:40:13 +020014083/* formatlong() emulates the format codes d, u, o, x and X, and
14084 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14085 * Python's regular ints.
14086 * Return value: a new PyUnicodeObject*, or NULL if error.
14087 * The output string is of the form
14088 * "-"? ("0x" | "0X")? digit+
14089 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14090 * set in flags. The case of hex digits will be correct,
14091 * There will be at least prec digits, zero-filled on the left if
14092 * necessary to get that many.
14093 * val object to be converted
14094 * flags bitmask of format flags; only F_ALT is looked at
14095 * prec minimum number of digits; 0-fill on left if needed
14096 * type a character in [duoxX]; u acts the same as d
14097 *
14098 * CAUTION: o, x and X conversions on regular ints can never
14099 * produce a '-' sign, but can for Python's unbounded ints.
14100 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014101PyObject *
14102_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014103{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014104 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014105 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014106 Py_ssize_t i;
14107 int sign; /* 1 if '-', else 0 */
14108 int len; /* number of characters */
14109 Py_ssize_t llen;
14110 int numdigits; /* len == numnondigits + numdigits */
14111 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014112
Victor Stinnerd0880d52012-04-27 23:40:13 +020014113 /* Avoid exceeding SSIZE_T_MAX */
14114 if (prec > INT_MAX-3) {
14115 PyErr_SetString(PyExc_OverflowError,
14116 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014117 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014118 }
14119
14120 assert(PyLong_Check(val));
14121
14122 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014123 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014124 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014125 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014126 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014127 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014128 /* int and int subclasses should print numerically when a numeric */
14129 /* format code is used (see issue18780) */
14130 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014131 break;
14132 case 'o':
14133 numnondigits = 2;
14134 result = PyNumber_ToBase(val, 8);
14135 break;
14136 case 'x':
14137 case 'X':
14138 numnondigits = 2;
14139 result = PyNumber_ToBase(val, 16);
14140 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014141 }
14142 if (!result)
14143 return NULL;
14144
14145 assert(unicode_modifiable(result));
14146 assert(PyUnicode_IS_READY(result));
14147 assert(PyUnicode_IS_ASCII(result));
14148
14149 /* To modify the string in-place, there can only be one reference. */
14150 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014151 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014152 PyErr_BadInternalCall();
14153 return NULL;
14154 }
14155 buf = PyUnicode_DATA(result);
14156 llen = PyUnicode_GET_LENGTH(result);
14157 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014158 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014159 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014160 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014161 return NULL;
14162 }
14163 len = (int)llen;
14164 sign = buf[0] == '-';
14165 numnondigits += sign;
14166 numdigits = len - numnondigits;
14167 assert(numdigits > 0);
14168
14169 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014170 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014171 (type == 'o' || type == 'x' || type == 'X'))) {
14172 assert(buf[sign] == '0');
14173 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14174 buf[sign+1] == 'o');
14175 numnondigits -= 2;
14176 buf += 2;
14177 len -= 2;
14178 if (sign)
14179 buf[0] = '-';
14180 assert(len == numnondigits + numdigits);
14181 assert(numdigits > 0);
14182 }
14183
14184 /* Fill with leading zeroes to meet minimum width. */
14185 if (prec > numdigits) {
14186 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14187 numnondigits + prec);
14188 char *b1;
14189 if (!r1) {
14190 Py_DECREF(result);
14191 return NULL;
14192 }
14193 b1 = PyBytes_AS_STRING(r1);
14194 for (i = 0; i < numnondigits; ++i)
14195 *b1++ = *buf++;
14196 for (i = 0; i < prec - numdigits; i++)
14197 *b1++ = '0';
14198 for (i = 0; i < numdigits; i++)
14199 *b1++ = *buf++;
14200 *b1 = '\0';
14201 Py_DECREF(result);
14202 result = r1;
14203 buf = PyBytes_AS_STRING(result);
14204 len = numnondigits + prec;
14205 }
14206
14207 /* Fix up case for hex conversions. */
14208 if (type == 'X') {
14209 /* Need to convert all lower case letters to upper case.
14210 and need to convert 0x to 0X (and -0x to -0X). */
14211 for (i = 0; i < len; i++)
14212 if (buf[i] >= 'a' && buf[i] <= 'x')
14213 buf[i] -= 'a'-'A';
14214 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014215 if (!PyUnicode_Check(result)
14216 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014217 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014218 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014219 Py_DECREF(result);
14220 result = unicode;
14221 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014222 else if (len != PyUnicode_GET_LENGTH(result)) {
14223 if (PyUnicode_Resize(&result, len) < 0)
14224 Py_CLEAR(result);
14225 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014226 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014227}
14228
Ethan Furmandf3ed242014-01-05 06:50:30 -080014229/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014230 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014231 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014232 * -1 and raise an exception on error */
14233static int
Victor Stinnera47082312012-10-04 02:19:54 +020014234mainformatlong(PyObject *v,
14235 struct unicode_format_arg_t *arg,
14236 PyObject **p_output,
14237 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014238{
14239 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014240 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014241
14242 if (!PyNumber_Check(v))
14243 goto wrongtype;
14244
Ethan Furman9ab74802014-03-21 06:38:46 -070014245 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014246 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014247 if (type == 'o' || type == 'x' || type == 'X') {
14248 iobj = PyNumber_Index(v);
14249 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014250 if (PyErr_ExceptionMatches(PyExc_TypeError))
14251 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014252 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014253 }
14254 }
14255 else {
14256 iobj = PyNumber_Long(v);
14257 if (iobj == NULL ) {
14258 if (PyErr_ExceptionMatches(PyExc_TypeError))
14259 goto wrongtype;
14260 return -1;
14261 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014262 }
14263 assert(PyLong_Check(iobj));
14264 }
14265 else {
14266 iobj = v;
14267 Py_INCREF(iobj);
14268 }
14269
14270 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014271 && arg->width == -1 && arg->prec == -1
14272 && !(arg->flags & (F_SIGN | F_BLANK))
14273 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014274 {
14275 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014276 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014277 int base;
14278
Victor Stinnera47082312012-10-04 02:19:54 +020014279 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014280 {
14281 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014282 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014283 case 'd':
14284 case 'i':
14285 case 'u':
14286 base = 10;
14287 break;
14288 case 'o':
14289 base = 8;
14290 break;
14291 case 'x':
14292 case 'X':
14293 base = 16;
14294 break;
14295 }
14296
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014297 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14298 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014299 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014300 }
14301 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014302 return 1;
14303 }
14304
Ethan Furmanb95b5612015-01-23 20:05:18 -080014305 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014306 Py_DECREF(iobj);
14307 if (res == NULL)
14308 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014309 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014310 return 0;
14311
14312wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014313 switch(type)
14314 {
14315 case 'o':
14316 case 'x':
14317 case 'X':
14318 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014319 "%%%c format: an integer is required, "
14320 "not %.200s",
14321 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014322 break;
14323 default:
14324 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014325 "%%%c format: a number is required, "
14326 "not %.200s",
14327 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014328 break;
14329 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014330 return -1;
14331}
14332
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014333static Py_UCS4
14334formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014335{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014336 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014337 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014338 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014339 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014340 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014341 goto onError;
14342 }
14343 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014344 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014345 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014346 /* make sure number is a type of integer */
14347 if (!PyLong_Check(v)) {
14348 iobj = PyNumber_Index(v);
14349 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014350 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014351 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014352 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014353 Py_DECREF(iobj);
14354 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014355 else {
14356 x = PyLong_AsLong(v);
14357 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014358 if (x == -1 && PyErr_Occurred())
14359 goto onError;
14360
Victor Stinner8faf8212011-12-08 22:14:11 +010014361 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014362 PyErr_SetString(PyExc_OverflowError,
14363 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014364 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014365 }
14366
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014367 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014368 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014369
Benjamin Peterson29060642009-01-31 22:14:21 +000014370 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014371 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014372 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014373 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014374}
14375
Victor Stinnera47082312012-10-04 02:19:54 +020014376/* Parse options of an argument: flags, width, precision.
14377 Handle also "%(name)" syntax.
14378
14379 Return 0 if the argument has been formatted into arg->str.
14380 Return 1 if the argument has been written into ctx->writer,
14381 Raise an exception and return -1 on error. */
14382static int
14383unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14384 struct unicode_format_arg_t *arg)
14385{
14386#define FORMAT_READ(ctx) \
14387 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14388
14389 PyObject *v;
14390
Victor Stinnera47082312012-10-04 02:19:54 +020014391 if (arg->ch == '(') {
14392 /* Get argument value from a dictionary. Example: "%(name)s". */
14393 Py_ssize_t keystart;
14394 Py_ssize_t keylen;
14395 PyObject *key;
14396 int pcount = 1;
14397
14398 if (ctx->dict == NULL) {
14399 PyErr_SetString(PyExc_TypeError,
14400 "format requires a mapping");
14401 return -1;
14402 }
14403 ++ctx->fmtpos;
14404 --ctx->fmtcnt;
14405 keystart = ctx->fmtpos;
14406 /* Skip over balanced parentheses */
14407 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14408 arg->ch = FORMAT_READ(ctx);
14409 if (arg->ch == ')')
14410 --pcount;
14411 else if (arg->ch == '(')
14412 ++pcount;
14413 ctx->fmtpos++;
14414 }
14415 keylen = ctx->fmtpos - keystart - 1;
14416 if (ctx->fmtcnt < 0 || pcount > 0) {
14417 PyErr_SetString(PyExc_ValueError,
14418 "incomplete format key");
14419 return -1;
14420 }
14421 key = PyUnicode_Substring(ctx->fmtstr,
14422 keystart, keystart + keylen);
14423 if (key == NULL)
14424 return -1;
14425 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014426 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014427 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014428 }
14429 ctx->args = PyObject_GetItem(ctx->dict, key);
14430 Py_DECREF(key);
14431 if (ctx->args == NULL)
14432 return -1;
14433 ctx->args_owned = 1;
14434 ctx->arglen = -1;
14435 ctx->argidx = -2;
14436 }
14437
14438 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014439 while (--ctx->fmtcnt >= 0) {
14440 arg->ch = FORMAT_READ(ctx);
14441 ctx->fmtpos++;
14442 switch (arg->ch) {
14443 case '-': arg->flags |= F_LJUST; continue;
14444 case '+': arg->flags |= F_SIGN; continue;
14445 case ' ': arg->flags |= F_BLANK; continue;
14446 case '#': arg->flags |= F_ALT; continue;
14447 case '0': arg->flags |= F_ZERO; continue;
14448 }
14449 break;
14450 }
14451
14452 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014453 if (arg->ch == '*') {
14454 v = unicode_format_getnextarg(ctx);
14455 if (v == NULL)
14456 return -1;
14457 if (!PyLong_Check(v)) {
14458 PyErr_SetString(PyExc_TypeError,
14459 "* wants int");
14460 return -1;
14461 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014462 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014463 if (arg->width == -1 && PyErr_Occurred())
14464 return -1;
14465 if (arg->width < 0) {
14466 arg->flags |= F_LJUST;
14467 arg->width = -arg->width;
14468 }
14469 if (--ctx->fmtcnt >= 0) {
14470 arg->ch = FORMAT_READ(ctx);
14471 ctx->fmtpos++;
14472 }
14473 }
14474 else if (arg->ch >= '0' && arg->ch <= '9') {
14475 arg->width = arg->ch - '0';
14476 while (--ctx->fmtcnt >= 0) {
14477 arg->ch = FORMAT_READ(ctx);
14478 ctx->fmtpos++;
14479 if (arg->ch < '0' || arg->ch > '9')
14480 break;
14481 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14482 mixing signed and unsigned comparison. Since arg->ch is between
14483 '0' and '9', casting to int is safe. */
14484 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14485 PyErr_SetString(PyExc_ValueError,
14486 "width too big");
14487 return -1;
14488 }
14489 arg->width = arg->width*10 + (arg->ch - '0');
14490 }
14491 }
14492
14493 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014494 if (arg->ch == '.') {
14495 arg->prec = 0;
14496 if (--ctx->fmtcnt >= 0) {
14497 arg->ch = FORMAT_READ(ctx);
14498 ctx->fmtpos++;
14499 }
14500 if (arg->ch == '*') {
14501 v = unicode_format_getnextarg(ctx);
14502 if (v == NULL)
14503 return -1;
14504 if (!PyLong_Check(v)) {
14505 PyErr_SetString(PyExc_TypeError,
14506 "* wants int");
14507 return -1;
14508 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014509 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014510 if (arg->prec == -1 && PyErr_Occurred())
14511 return -1;
14512 if (arg->prec < 0)
14513 arg->prec = 0;
14514 if (--ctx->fmtcnt >= 0) {
14515 arg->ch = FORMAT_READ(ctx);
14516 ctx->fmtpos++;
14517 }
14518 }
14519 else if (arg->ch >= '0' && arg->ch <= '9') {
14520 arg->prec = arg->ch - '0';
14521 while (--ctx->fmtcnt >= 0) {
14522 arg->ch = FORMAT_READ(ctx);
14523 ctx->fmtpos++;
14524 if (arg->ch < '0' || arg->ch > '9')
14525 break;
14526 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14527 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014528 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014529 return -1;
14530 }
14531 arg->prec = arg->prec*10 + (arg->ch - '0');
14532 }
14533 }
14534 }
14535
14536 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14537 if (ctx->fmtcnt >= 0) {
14538 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14539 if (--ctx->fmtcnt >= 0) {
14540 arg->ch = FORMAT_READ(ctx);
14541 ctx->fmtpos++;
14542 }
14543 }
14544 }
14545 if (ctx->fmtcnt < 0) {
14546 PyErr_SetString(PyExc_ValueError,
14547 "incomplete format");
14548 return -1;
14549 }
14550 return 0;
14551
14552#undef FORMAT_READ
14553}
14554
14555/* Format one argument. Supported conversion specifiers:
14556
14557 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014558 - "i", "d", "u": int or float
14559 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014560 - "e", "E", "f", "F", "g", "G": float
14561 - "c": int or str (1 character)
14562
Victor Stinner8dbd4212012-12-04 09:30:24 +010014563 When possible, the output is written directly into the Unicode writer
14564 (ctx->writer). A string is created when padding is required.
14565
Victor Stinnera47082312012-10-04 02:19:54 +020014566 Return 0 if the argument has been formatted into *p_str,
14567 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014568 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014569static int
14570unicode_format_arg_format(struct unicode_formatter_t *ctx,
14571 struct unicode_format_arg_t *arg,
14572 PyObject **p_str)
14573{
14574 PyObject *v;
14575 _PyUnicodeWriter *writer = &ctx->writer;
14576
14577 if (ctx->fmtcnt == 0)
14578 ctx->writer.overallocate = 0;
14579
Victor Stinnera47082312012-10-04 02:19:54 +020014580 v = unicode_format_getnextarg(ctx);
14581 if (v == NULL)
14582 return -1;
14583
Victor Stinnera47082312012-10-04 02:19:54 +020014584
14585 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014586 case 's':
14587 case 'r':
14588 case 'a':
14589 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14590 /* Fast path */
14591 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14592 return -1;
14593 return 1;
14594 }
14595
14596 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14597 *p_str = v;
14598 Py_INCREF(*p_str);
14599 }
14600 else {
14601 if (arg->ch == 's')
14602 *p_str = PyObject_Str(v);
14603 else if (arg->ch == 'r')
14604 *p_str = PyObject_Repr(v);
14605 else
14606 *p_str = PyObject_ASCII(v);
14607 }
14608 break;
14609
14610 case 'i':
14611 case 'd':
14612 case 'u':
14613 case 'o':
14614 case 'x':
14615 case 'X':
14616 {
14617 int ret = mainformatlong(v, arg, p_str, writer);
14618 if (ret != 0)
14619 return ret;
14620 arg->sign = 1;
14621 break;
14622 }
14623
14624 case 'e':
14625 case 'E':
14626 case 'f':
14627 case 'F':
14628 case 'g':
14629 case 'G':
14630 if (arg->width == -1 && arg->prec == -1
14631 && !(arg->flags & (F_SIGN | F_BLANK)))
14632 {
14633 /* Fast path */
14634 if (formatfloat(v, arg, NULL, writer) == -1)
14635 return -1;
14636 return 1;
14637 }
14638
14639 arg->sign = 1;
14640 if (formatfloat(v, arg, p_str, NULL) == -1)
14641 return -1;
14642 break;
14643
14644 case 'c':
14645 {
14646 Py_UCS4 ch = formatchar(v);
14647 if (ch == (Py_UCS4) -1)
14648 return -1;
14649 if (arg->width == -1 && arg->prec == -1) {
14650 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014651 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014652 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014653 return 1;
14654 }
14655 *p_str = PyUnicode_FromOrdinal(ch);
14656 break;
14657 }
14658
14659 default:
14660 PyErr_Format(PyExc_ValueError,
14661 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014662 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014663 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14664 (int)arg->ch,
14665 ctx->fmtpos - 1);
14666 return -1;
14667 }
14668 if (*p_str == NULL)
14669 return -1;
14670 assert (PyUnicode_Check(*p_str));
14671 return 0;
14672}
14673
14674static int
14675unicode_format_arg_output(struct unicode_formatter_t *ctx,
14676 struct unicode_format_arg_t *arg,
14677 PyObject *str)
14678{
14679 Py_ssize_t len;
14680 enum PyUnicode_Kind kind;
14681 void *pbuf;
14682 Py_ssize_t pindex;
14683 Py_UCS4 signchar;
14684 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014685 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014686 Py_ssize_t sublen;
14687 _PyUnicodeWriter *writer = &ctx->writer;
14688 Py_UCS4 fill;
14689
14690 fill = ' ';
14691 if (arg->sign && arg->flags & F_ZERO)
14692 fill = '0';
14693
14694 if (PyUnicode_READY(str) == -1)
14695 return -1;
14696
14697 len = PyUnicode_GET_LENGTH(str);
14698 if ((arg->width == -1 || arg->width <= len)
14699 && (arg->prec == -1 || arg->prec >= len)
14700 && !(arg->flags & (F_SIGN | F_BLANK)))
14701 {
14702 /* Fast path */
14703 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14704 return -1;
14705 return 0;
14706 }
14707
14708 /* Truncate the string for "s", "r" and "a" formats
14709 if the precision is set */
14710 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14711 if (arg->prec >= 0 && len > arg->prec)
14712 len = arg->prec;
14713 }
14714
14715 /* Adjust sign and width */
14716 kind = PyUnicode_KIND(str);
14717 pbuf = PyUnicode_DATA(str);
14718 pindex = 0;
14719 signchar = '\0';
14720 if (arg->sign) {
14721 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14722 if (ch == '-' || ch == '+') {
14723 signchar = ch;
14724 len--;
14725 pindex++;
14726 }
14727 else if (arg->flags & F_SIGN)
14728 signchar = '+';
14729 else if (arg->flags & F_BLANK)
14730 signchar = ' ';
14731 else
14732 arg->sign = 0;
14733 }
14734 if (arg->width < len)
14735 arg->width = len;
14736
14737 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014738 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014739 if (!(arg->flags & F_LJUST)) {
14740 if (arg->sign) {
14741 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014742 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014743 }
14744 else {
14745 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014746 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014747 }
14748 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014749 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14750 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014751 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014752 }
14753
Victor Stinnera47082312012-10-04 02:19:54 +020014754 buflen = arg->width;
14755 if (arg->sign && len == arg->width)
14756 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014757 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014758 return -1;
14759
14760 /* Write the sign if needed */
14761 if (arg->sign) {
14762 if (fill != ' ') {
14763 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14764 writer->pos += 1;
14765 }
14766 if (arg->width > len)
14767 arg->width--;
14768 }
14769
14770 /* Write the numeric prefix for "x", "X" and "o" formats
14771 if the alternate form is used.
14772 For example, write "0x" for the "%#x" format. */
14773 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14774 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14775 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14776 if (fill != ' ') {
14777 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14778 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14779 writer->pos += 2;
14780 pindex += 2;
14781 }
14782 arg->width -= 2;
14783 if (arg->width < 0)
14784 arg->width = 0;
14785 len -= 2;
14786 }
14787
14788 /* Pad left with the fill character if needed */
14789 if (arg->width > len && !(arg->flags & F_LJUST)) {
14790 sublen = arg->width - len;
14791 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14792 writer->pos += sublen;
14793 arg->width = len;
14794 }
14795
14796 /* If padding with spaces: write sign if needed and/or numeric prefix if
14797 the alternate form is used */
14798 if (fill == ' ') {
14799 if (arg->sign) {
14800 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14801 writer->pos += 1;
14802 }
14803 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14804 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14805 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14806 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14807 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14808 writer->pos += 2;
14809 pindex += 2;
14810 }
14811 }
14812
14813 /* Write characters */
14814 if (len) {
14815 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14816 str, pindex, len);
14817 writer->pos += len;
14818 }
14819
14820 /* Pad right with the fill character if needed */
14821 if (arg->width > len) {
14822 sublen = arg->width - len;
14823 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14824 writer->pos += sublen;
14825 }
14826 return 0;
14827}
14828
14829/* Helper of PyUnicode_Format(): format one arg.
14830 Return 0 on success, raise an exception and return -1 on error. */
14831static int
14832unicode_format_arg(struct unicode_formatter_t *ctx)
14833{
14834 struct unicode_format_arg_t arg;
14835 PyObject *str;
14836 int ret;
14837
Victor Stinner8dbd4212012-12-04 09:30:24 +010014838 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014839 if (arg.ch == '%') {
14840 ctx->fmtpos++;
14841 ctx->fmtcnt--;
14842 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14843 return -1;
14844 return 0;
14845 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014846 arg.flags = 0;
14847 arg.width = -1;
14848 arg.prec = -1;
14849 arg.sign = 0;
14850 str = NULL;
14851
Victor Stinnera47082312012-10-04 02:19:54 +020014852 ret = unicode_format_arg_parse(ctx, &arg);
14853 if (ret == -1)
14854 return -1;
14855
14856 ret = unicode_format_arg_format(ctx, &arg, &str);
14857 if (ret == -1)
14858 return -1;
14859
14860 if (ret != 1) {
14861 ret = unicode_format_arg_output(ctx, &arg, str);
14862 Py_DECREF(str);
14863 if (ret == -1)
14864 return -1;
14865 }
14866
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014867 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014868 PyErr_SetString(PyExc_TypeError,
14869 "not all arguments converted during string formatting");
14870 return -1;
14871 }
14872 return 0;
14873}
14874
Alexander Belopolsky40018472011-02-26 01:02:56 +000014875PyObject *
14876PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014877{
Victor Stinnera47082312012-10-04 02:19:54 +020014878 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014879
Guido van Rossumd57fd912000-03-10 22:53:23 +000014880 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014881 PyErr_BadInternalCall();
14882 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014883 }
Victor Stinnera47082312012-10-04 02:19:54 +020014884
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014885 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014886 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014887
14888 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014889 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14890 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14891 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14892 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014893
Victor Stinner8f674cc2013-04-17 23:02:17 +020014894 _PyUnicodeWriter_Init(&ctx.writer);
14895 ctx.writer.min_length = ctx.fmtcnt + 100;
14896 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014897
Guido van Rossumd57fd912000-03-10 22:53:23 +000014898 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014899 ctx.arglen = PyTuple_Size(args);
14900 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014901 }
14902 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014903 ctx.arglen = -1;
14904 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014905 }
Victor Stinnera47082312012-10-04 02:19:54 +020014906 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014907 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014908 ctx.dict = args;
14909 else
14910 ctx.dict = NULL;
14911 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014912
Victor Stinnera47082312012-10-04 02:19:54 +020014913 while (--ctx.fmtcnt >= 0) {
14914 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014915 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014916
14917 nonfmtpos = ctx.fmtpos++;
14918 while (ctx.fmtcnt >= 0 &&
14919 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14920 ctx.fmtpos++;
14921 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014922 }
Victor Stinnera47082312012-10-04 02:19:54 +020014923 if (ctx.fmtcnt < 0) {
14924 ctx.fmtpos--;
14925 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014926 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014927
Victor Stinnercfc4c132013-04-03 01:48:39 +020014928 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14929 nonfmtpos, ctx.fmtpos) < 0)
14930 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014931 }
14932 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014933 ctx.fmtpos++;
14934 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014935 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014936 }
14937 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014938
Victor Stinnera47082312012-10-04 02:19:54 +020014939 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014940 PyErr_SetString(PyExc_TypeError,
14941 "not all arguments converted during string formatting");
14942 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014943 }
14944
Victor Stinnera47082312012-10-04 02:19:54 +020014945 if (ctx.args_owned) {
14946 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014947 }
Victor Stinnera47082312012-10-04 02:19:54 +020014948 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014949
Benjamin Peterson29060642009-01-31 22:14:21 +000014950 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014951 _PyUnicodeWriter_Dealloc(&ctx.writer);
14952 if (ctx.args_owned) {
14953 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014954 }
14955 return NULL;
14956}
14957
Jeremy Hylton938ace62002-07-17 16:30:39 +000014958static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014959unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14960
Tim Peters6d6c1a32001-08-02 04:15:00 +000014961static PyObject *
14962unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14963{
Benjamin Peterson29060642009-01-31 22:14:21 +000014964 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014965 static char *kwlist[] = {"object", "encoding", "errors", 0};
14966 char *encoding = NULL;
14967 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014968
Benjamin Peterson14339b62009-01-31 16:36:08 +000014969 if (type != &PyUnicode_Type)
14970 return unicode_subtype_new(type, args, kwds);
14971 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014972 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014973 return NULL;
14974 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014975 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014976 if (encoding == NULL && errors == NULL)
14977 return PyObject_Str(x);
14978 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014979 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014980}
14981
Guido van Rossume023fe02001-08-30 03:12:59 +000014982static PyObject *
14983unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14984{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014985 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014986 Py_ssize_t length, char_size;
14987 int share_wstr, share_utf8;
14988 unsigned int kind;
14989 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014990
Benjamin Peterson14339b62009-01-31 16:36:08 +000014991 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014992
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014993 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014994 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014995 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014996 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014997 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014998 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014999 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015000 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015001
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015002 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015003 if (self == NULL) {
15004 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015005 return NULL;
15006 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015007 kind = PyUnicode_KIND(unicode);
15008 length = PyUnicode_GET_LENGTH(unicode);
15009
15010 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015011#ifdef Py_DEBUG
15012 _PyUnicode_HASH(self) = -1;
15013#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015014 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015015#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015016 _PyUnicode_STATE(self).interned = 0;
15017 _PyUnicode_STATE(self).kind = kind;
15018 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015019 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015020 _PyUnicode_STATE(self).ready = 1;
15021 _PyUnicode_WSTR(self) = NULL;
15022 _PyUnicode_UTF8_LENGTH(self) = 0;
15023 _PyUnicode_UTF8(self) = NULL;
15024 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015025 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015026
15027 share_utf8 = 0;
15028 share_wstr = 0;
15029 if (kind == PyUnicode_1BYTE_KIND) {
15030 char_size = 1;
15031 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15032 share_utf8 = 1;
15033 }
15034 else if (kind == PyUnicode_2BYTE_KIND) {
15035 char_size = 2;
15036 if (sizeof(wchar_t) == 2)
15037 share_wstr = 1;
15038 }
15039 else {
15040 assert(kind == PyUnicode_4BYTE_KIND);
15041 char_size = 4;
15042 if (sizeof(wchar_t) == 4)
15043 share_wstr = 1;
15044 }
15045
15046 /* Ensure we won't overflow the length. */
15047 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15048 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015049 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015050 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015051 data = PyObject_MALLOC((length + 1) * char_size);
15052 if (data == NULL) {
15053 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015054 goto onError;
15055 }
15056
Victor Stinnerc3c74152011-10-02 20:39:55 +020015057 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015058 if (share_utf8) {
15059 _PyUnicode_UTF8_LENGTH(self) = length;
15060 _PyUnicode_UTF8(self) = data;
15061 }
15062 if (share_wstr) {
15063 _PyUnicode_WSTR_LENGTH(self) = length;
15064 _PyUnicode_WSTR(self) = (wchar_t *)data;
15065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015066
Christian Heimesf051e432016-09-13 20:22:02 +020015067 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015068 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015069 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015070#ifdef Py_DEBUG
15071 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15072#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015073 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015074 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015075
15076onError:
15077 Py_DECREF(unicode);
15078 Py_DECREF(self);
15079 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015080}
15081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015082PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015083"str(object='') -> str\n\
15084str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015085\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015086Create a new string object from the given object. If encoding or\n\
15087errors is specified, then the object must expose a data buffer\n\
15088that will be decoded using the given encoding and error handler.\n\
15089Otherwise, returns the result of object.__str__() (if defined)\n\
15090or repr(object).\n\
15091encoding defaults to sys.getdefaultencoding().\n\
15092errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015093
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015094static PyObject *unicode_iter(PyObject *seq);
15095
Guido van Rossumd57fd912000-03-10 22:53:23 +000015096PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015097 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015098 "str", /* tp_name */
15099 sizeof(PyUnicodeObject), /* tp_basicsize */
15100 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015101 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015102 (destructor)unicode_dealloc, /* tp_dealloc */
15103 0, /* tp_print */
15104 0, /* tp_getattr */
15105 0, /* tp_setattr */
15106 0, /* tp_reserved */
15107 unicode_repr, /* tp_repr */
15108 &unicode_as_number, /* tp_as_number */
15109 &unicode_as_sequence, /* tp_as_sequence */
15110 &unicode_as_mapping, /* tp_as_mapping */
15111 (hashfunc) unicode_hash, /* tp_hash*/
15112 0, /* tp_call*/
15113 (reprfunc) unicode_str, /* tp_str */
15114 PyObject_GenericGetAttr, /* tp_getattro */
15115 0, /* tp_setattro */
15116 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015117 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015118 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15119 unicode_doc, /* tp_doc */
15120 0, /* tp_traverse */
15121 0, /* tp_clear */
15122 PyUnicode_RichCompare, /* tp_richcompare */
15123 0, /* tp_weaklistoffset */
15124 unicode_iter, /* tp_iter */
15125 0, /* tp_iternext */
15126 unicode_methods, /* tp_methods */
15127 0, /* tp_members */
15128 0, /* tp_getset */
15129 &PyBaseObject_Type, /* tp_base */
15130 0, /* tp_dict */
15131 0, /* tp_descr_get */
15132 0, /* tp_descr_set */
15133 0, /* tp_dictoffset */
15134 0, /* tp_init */
15135 0, /* tp_alloc */
15136 unicode_new, /* tp_new */
15137 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015138};
15139
15140/* Initialize the Unicode implementation */
15141
Victor Stinner3a50e702011-10-18 21:21:00 +020015142int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015143{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015144 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015145 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015146 0x000A, /* LINE FEED */
15147 0x000D, /* CARRIAGE RETURN */
15148 0x001C, /* FILE SEPARATOR */
15149 0x001D, /* GROUP SEPARATOR */
15150 0x001E, /* RECORD SEPARATOR */
15151 0x0085, /* NEXT LINE */
15152 0x2028, /* LINE SEPARATOR */
15153 0x2029, /* PARAGRAPH SEPARATOR */
15154 };
15155
Fred Drakee4315f52000-05-09 19:53:39 +000015156 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015157 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015158 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015159 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015160 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015161
Guido van Rossumcacfc072002-05-24 19:01:59 +000015162 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015163 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015164
15165 /* initialize the linebreak bloom filter */
15166 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015167 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015168 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015169
Christian Heimes26532f72013-07-20 14:57:16 +020015170 if (PyType_Ready(&EncodingMapType) < 0)
15171 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015172
Benjamin Petersonc4311282012-10-30 23:21:10 -040015173 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15174 Py_FatalError("Can't initialize field name iterator type");
15175
15176 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15177 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015178
Victor Stinner3a50e702011-10-18 21:21:00 +020015179 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015180}
15181
15182/* Finalize the Unicode implementation */
15183
/* Does nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15189
Guido van Rossumd57fd912000-03-10 22:53:23 +000015190void
Thomas Wouters78890102000-07-22 19:25:51 +000015191_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015192{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015193 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015194
Serhiy Storchaka05997252013-01-26 12:14:02 +020015195 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015196
Serhiy Storchaka05997252013-01-26 12:14:02 +020015197 for (i = 0; i < 256; i++)
15198 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015199 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015200 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015201}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015202
Walter Dörwald16807132007-05-25 13:52:07 +000015203void
15204PyUnicode_InternInPlace(PyObject **p)
15205{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015206 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015207 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015208#ifdef Py_DEBUG
15209 assert(s != NULL);
15210 assert(_PyUnicode_CHECK(s));
15211#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015212 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015213 return;
15214#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015215 /* If it's a subclass, we don't really know what putting
15216 it in the interned dict might do. */
15217 if (!PyUnicode_CheckExact(s))
15218 return;
15219 if (PyUnicode_CHECK_INTERNED(s))
15220 return;
15221 if (interned == NULL) {
15222 interned = PyDict_New();
15223 if (interned == NULL) {
15224 PyErr_Clear(); /* Don't leave an exception */
15225 return;
15226 }
15227 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015228 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015229 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015230 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015231 if (t == NULL) {
15232 PyErr_Clear();
15233 return;
15234 }
15235 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015236 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015237 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015238 return;
15239 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015240 /* The two references in interned are not counted by refcnt.
15241 The deallocator will take care of this */
15242 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015243 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015244}
15245
15246void
15247PyUnicode_InternImmortal(PyObject **p)
15248{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015249 PyUnicode_InternInPlace(p);
15250 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015251 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015252 Py_INCREF(*p);
15253 }
Walter Dörwald16807132007-05-25 13:52:07 +000015254}
15255
15256PyObject *
15257PyUnicode_InternFromString(const char *cp)
15258{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015259 PyObject *s = PyUnicode_FromString(cp);
15260 if (s == NULL)
15261 return NULL;
15262 PyUnicode_InternInPlace(&s);
15263 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015264}
15265
Alexander Belopolsky40018472011-02-26 01:02:56 +000015266void
15267_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015268{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015269 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015270 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015271 Py_ssize_t i, n;
15272 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015273
Benjamin Peterson14339b62009-01-31 16:36:08 +000015274 if (interned == NULL || !PyDict_Check(interned))
15275 return;
15276 keys = PyDict_Keys(interned);
15277 if (keys == NULL || !PyList_Check(keys)) {
15278 PyErr_Clear();
15279 return;
15280 }
Walter Dörwald16807132007-05-25 13:52:07 +000015281
Benjamin Peterson14339b62009-01-31 16:36:08 +000015282 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15283 detector, interned unicode strings are not forcibly deallocated;
15284 rather, we give them their stolen references back, and then clear
15285 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015286
Benjamin Peterson14339b62009-01-31 16:36:08 +000015287 n = PyList_GET_SIZE(keys);
15288 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015289 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015290 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015291 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015292 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015293 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015295 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015296 case SSTATE_NOT_INTERNED:
15297 /* XXX Shouldn't happen */
15298 break;
15299 case SSTATE_INTERNED_IMMORTAL:
15300 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015301 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015302 break;
15303 case SSTATE_INTERNED_MORTAL:
15304 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015305 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015306 break;
15307 default:
15308 Py_FatalError("Inconsistent interned string state.");
15309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015310 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015311 }
15312 fprintf(stderr, "total size of all interned strings: "
15313 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15314 "mortal/immortal\n", mortal_size, immortal_size);
15315 Py_DECREF(keys);
15316 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015317 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015318}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015319
15320
15321/********************* Unicode Iterator **************************/
15322
15323typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015324 PyObject_HEAD
15325 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015326 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015327} unicodeiterobject;
15328
15329static void
15330unicodeiter_dealloc(unicodeiterobject *it)
15331{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015332 _PyObject_GC_UNTRACK(it);
15333 Py_XDECREF(it->it_seq);
15334 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015335}
15336
15337static int
15338unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15339{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015340 Py_VISIT(it->it_seq);
15341 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015342}
15343
15344static PyObject *
15345unicodeiter_next(unicodeiterobject *it)
15346{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015347 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015348
Benjamin Peterson14339b62009-01-31 16:36:08 +000015349 assert(it != NULL);
15350 seq = it->it_seq;
15351 if (seq == NULL)
15352 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015353 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015355 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15356 int kind = PyUnicode_KIND(seq);
15357 void *data = PyUnicode_DATA(seq);
15358 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15359 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015360 if (item != NULL)
15361 ++it->it_index;
15362 return item;
15363 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015364
Benjamin Peterson14339b62009-01-31 16:36:08 +000015365 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015366 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015367 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015368}
15369
15370static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015371unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015372{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015373 Py_ssize_t len = 0;
15374 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015375 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015376 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015377}
15378
15379PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15380
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015381static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015382unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015383{
15384 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015385 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015386 it->it_seq, it->it_index);
15387 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015388 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015389 if (u == NULL)
15390 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015391 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015392 }
15393}
15394
15395PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15396
15397static PyObject *
15398unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15399{
15400 Py_ssize_t index = PyLong_AsSsize_t(state);
15401 if (index == -1 && PyErr_Occurred())
15402 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015403 if (it->it_seq != NULL) {
15404 if (index < 0)
15405 index = 0;
15406 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15407 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15408 it->it_index = index;
15409 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015410 Py_RETURN_NONE;
15411}
15412
15413PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15414
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015415static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015416 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015417 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015418 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15419 reduce_doc},
15420 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15421 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015422 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015423};
15424
15425PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015426 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15427 "str_iterator", /* tp_name */
15428 sizeof(unicodeiterobject), /* tp_basicsize */
15429 0, /* tp_itemsize */
15430 /* methods */
15431 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15432 0, /* tp_print */
15433 0, /* tp_getattr */
15434 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015435 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015436 0, /* tp_repr */
15437 0, /* tp_as_number */
15438 0, /* tp_as_sequence */
15439 0, /* tp_as_mapping */
15440 0, /* tp_hash */
15441 0, /* tp_call */
15442 0, /* tp_str */
15443 PyObject_GenericGetAttr, /* tp_getattro */
15444 0, /* tp_setattro */
15445 0, /* tp_as_buffer */
15446 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15447 0, /* tp_doc */
15448 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15449 0, /* tp_clear */
15450 0, /* tp_richcompare */
15451 0, /* tp_weaklistoffset */
15452 PyObject_SelfIter, /* tp_iter */
15453 (iternextfunc)unicodeiter_next, /* tp_iternext */
15454 unicodeiter_methods, /* tp_methods */
15455 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015456};
15457
15458static PyObject *
15459unicode_iter(PyObject *seq)
15460{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015461 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015462
Benjamin Peterson14339b62009-01-31 16:36:08 +000015463 if (!PyUnicode_Check(seq)) {
15464 PyErr_BadInternalCall();
15465 return NULL;
15466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015467 if (PyUnicode_READY(seq) == -1)
15468 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015469 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15470 if (it == NULL)
15471 return NULL;
15472 it->it_index = 0;
15473 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015474 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015475 _PyObject_GC_TRACK(it);
15476 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015477}
15478
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015479
15480size_t
15481Py_UNICODE_strlen(const Py_UNICODE *u)
15482{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015483 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015484}
15485
15486Py_UNICODE*
15487Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15488{
15489 Py_UNICODE *u = s1;
15490 while ((*u++ = *s2++));
15491 return s1;
15492}
15493
15494Py_UNICODE*
15495Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15496{
15497 Py_UNICODE *u = s1;
15498 while ((*u++ = *s2++))
15499 if (n-- == 0)
15500 break;
15501 return s1;
15502}
15503
15504Py_UNICODE*
15505Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15506{
15507 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015508 u1 += wcslen(u1);
15509 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015510 return s1;
15511}
15512
15513int
15514Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15515{
15516 while (*s1 && *s2 && *s1 == *s2)
15517 s1++, s2++;
15518 if (*s1 && *s2)
15519 return (*s1 < *s2) ? -1 : +1;
15520 if (*s1)
15521 return 1;
15522 if (*s2)
15523 return -1;
15524 return 0;
15525}
15526
15527int
15528Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15529{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015530 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015531 for (; n != 0; n--) {
15532 u1 = *s1;
15533 u2 = *s2;
15534 if (u1 != u2)
15535 return (u1 < u2) ? -1 : +1;
15536 if (u1 == '\0')
15537 return 0;
15538 s1++;
15539 s2++;
15540 }
15541 return 0;
15542}
15543
15544Py_UNICODE*
15545Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15546{
15547 const Py_UNICODE *p;
15548 for (p = s; *p; p++)
15549 if (*p == c)
15550 return (Py_UNICODE*)p;
15551 return NULL;
15552}
15553
15554Py_UNICODE*
15555Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15556{
15557 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015558 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015559 while (p != s) {
15560 p--;
15561 if (*p == c)
15562 return (Py_UNICODE*)p;
15563 }
15564 return NULL;
15565}
Victor Stinner331ea922010-08-10 16:37:20 +000015566
Victor Stinner71133ff2010-09-01 23:43:53 +000015567Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015568PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015569{
Victor Stinner577db2c2011-10-11 22:12:48 +020015570 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015571 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015573 if (!PyUnicode_Check(unicode)) {
15574 PyErr_BadArgument();
15575 return NULL;
15576 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015577 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015578 if (u == NULL)
15579 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015580 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015581 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015582 PyErr_NoMemory();
15583 return NULL;
15584 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015585 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015586 size *= sizeof(Py_UNICODE);
15587 copy = PyMem_Malloc(size);
15588 if (copy == NULL) {
15589 PyErr_NoMemory();
15590 return NULL;
15591 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015592 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015593 return copy;
15594}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015595
Georg Brandl66c221e2010-10-14 07:04:07 +000015596/* A _string module, to export formatter_parser and formatter_field_name_split
15597 to the string.Formatter class implemented in Python. */
15598
15599static PyMethodDef _string_methods[] = {
15600 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15601 METH_O, PyDoc_STR("split the argument as a field name")},
15602 {"formatter_parser", (PyCFunction) formatter_parser,
15603 METH_O, PyDoc_STR("parse the argument as a format string")},
15604 {NULL, NULL}
15605};
15606
15607static struct PyModuleDef _string_module = {
15608 PyModuleDef_HEAD_INIT,
15609 "_string",
15610 PyDoc_STR("string helper module"),
15611 0,
15612 _string_methods,
15613 NULL,
15614 NULL,
15615 NULL,
15616 NULL
15617};
15618
15619PyMODINIT_FUNC
15620PyInit__string(void)
15621{
15622 return PyModule_Create(&_string_module);
15623}
15624
15625
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015626#ifdef __cplusplus
15627}
15628#endif