blob: 2b1db918a154797feade204a6b93da6e10e00fbd [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010044#include "pycore_object.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010045#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050047#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070048#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Larry Hastings61272b72014-01-07 12:41:53 -080054/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090055class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080056[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090057/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
58
59/*[python input]
60class Py_UCS4_converter(CConverter):
61 type = 'Py_UCS4'
62 converter = 'convert_uc'
63
64 def converter_init(self):
65 if self.default is not unspecified:
66 self.c_default = ascii(self.default)
67 if len(self.c_default) > 4 or self.c_default[0] != "'":
68 self.c_default = hex(ord(self.default))
69
70[python start generated code]*/
71/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080072
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000073/* --- Globals ------------------------------------------------------------
74
Serhiy Storchaka05997252013-01-26 12:14:02 +020075NOTE: In the interpreter's initialization phase, some globals are currently
76 initialized dynamically as needed. In the process Unicode objects may
77 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000078
79*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000080
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000081
82#ifdef __cplusplus
83extern "C" {
84#endif
85
/* Largest valid Unicode code point: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10ffff

/* Object check used by the accessor macros in this file.  Release builds
   only verify that the object is a str; debug builds run the consistency
   checker instead (with check_content=0, i.e. without scanning the
   characters). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020094
/* --- Field accessors -----------------------------------------------------

   The underscore-prefixed macros that are plain member accesses perform no
   checks and expand to lvalues; the macros containing assert() additionally
   validate the object first. */

/* Cached UTF-8 buffer pointer (unchecked). */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory located directly after the PyASCIIObject header; otherwise it is
   the cached utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Cached UTF-8 length field (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string.  For a compact ASCII string this is the
   string length itself; otherwise it is the cached utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation and its length (unchecked). */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Length, state bit-field and cached hash (unchecked). */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)

/* Storage kind and length, guarded by the object check only (the string
   does not have to be ready). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer stored in a PyUnicodeObject (unchecked). */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129
/* File-local replacement for PyUnicode_READY(): after the object check,
   call _PyUnicode_Ready() only when the string is not ready yet; a string
   that is already ready yields 0 without any call. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
      _PyUnicode_Ready(op) :                            \
      0))
Victor Stinner910337b2011-10-03 03:20:16 +0200136
/* True if the cached UTF-8 pointer is the very same pointer as the string's
   character data, i.e. no separate UTF-8 buffer exists.  Must not be used
   on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the very same pointer as the string's
   character data.  The macro now consistently inspects its own argument
   `op`; it previously expanded `_PyUnicode_WSTR(unicode)`, which silently
   depended on a variable named `unicode` at the expansion site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
144
/* Non-zero if the object has its own UTF-8 memory block: it is not a
   compact ASCII string, the utf8 pointer is set, and that pointer is not
   shared with the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero if the object has its own wstr memory block: the wstr pointer is
   set and it is not the case that the string is ready with wstr shared
   with the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op)                                \
     && !(PyUnicode_IS_READY(op)                        \
          && _PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
158
/* Generic element-wise copy with a type conversion.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (pointers to "from_type"); to points to the
   destination buffer (pointer to "to_type"), which must be large enough to
   receive end - begin items.  Each source item is converted with a plain
   cast.  The bulk of the input is handled four items per iteration; the
   remaining 0..3 items are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t _count = _end - _iter;               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);     \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < _end; ++_iter, ++_to) {          \
            *_to = (to_type) *_iter;                    \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200182
/* Platform-tuned overallocation factor.  Going by the notes below, a factor
   of N corresponds to 1/N of extra space (4 -> 25%, 2 -> 50%). */
#ifndef MS_WINDOWS
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
196 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200199static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203
Serhiy Storchaka678db842013-01-26 12:16:36 +0200204#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200205 do { \
206 if (unicode_empty != NULL) \
207 Py_INCREF(unicode_empty); \
208 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209 unicode_empty = PyUnicode_New(0, 0); \
210 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200212 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
213 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200214 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200215 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
Serhiy Storchaka678db842013-01-26 12:16:36 +0200217#define _Py_RETURN_UNICODE_EMPTY() \
218 do { \
219 _Py_INCREF_UNICODE_EMPTY(); \
220 return unicode_empty; \
221 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000222
Victor Stinner59423e32018-11-26 13:40:01 +0100223static inline void
224unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
225 Py_ssize_t start, Py_ssize_t length)
226{
227 assert(0 <= start);
228 assert(kind != PyUnicode_WCHAR_KIND);
229 switch (kind) {
230 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100231 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100232 Py_UCS1 ch = (unsigned char)value;
233 Py_UCS1 *to = (Py_UCS1 *)data + start;
234 memset(to, ch, length);
235 break;
236 }
237 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS2 ch = (Py_UCS2)value;
240 Py_UCS2 *to = (Py_UCS2 *)data + start;
241 const Py_UCS2 *end = to + length;
242 for (; to < end; ++to) *to = ch;
243 break;
244 }
245 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100246 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100247 Py_UCS4 ch = value;
248 Py_UCS4 * to = (Py_UCS4 *)data + start;
249 const Py_UCS4 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 default: Py_UNREACHABLE();
254 }
255}
256
257
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200258/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700259static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200260_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
261
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200262/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200263static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200264
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265/* Single character Unicode strings in the Latin-1 range are being
266 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200267static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000268
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by ASCII code (0x00..0x7F, sixteen entries per
   row): 1 marks a whitespace character, 0 anything else.  The marked
   characters are:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
299
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200300/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200301static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200302static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100303static int unicode_modifiable(PyObject *unicode);
304
Victor Stinnerfe226c02011-10-03 03:52:20 +0200305
Alexander Belopolsky40018472011-02-26 01:02:56 +0000306static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100307_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200308static PyObject *
309_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
310static PyObject *
311_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
312
313static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000314unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000315 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100316 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000317 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
318
Alexander Belopolsky40018472011-02-26 01:02:56 +0000319static void
320raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300321 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100322 PyObject *unicode,
323 Py_ssize_t startpos, Py_ssize_t endpos,
324 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000325
/* Same idea for line breaks.

   Lookup table indexed by ASCII code (0x00..0x7F, sixteen entries per
   row): 1 marks a line-break character, 0 anything else.  The marked
   characters are:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
353
INADA Naoki3ae20562017-01-16 20:41:20 +0900354static int convert_uc(PyObject *obj, void *addr);
355
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300356#include "clinic/unicodeobject.c.h"
357
Victor Stinner3d4226a2018-08-29 22:21:32 +0200358_Py_error_handler
359_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200360{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200361 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200362 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200363 }
364 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200365 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200366 }
367 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200368 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200369 }
370 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200371 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200372 }
373 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200374 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200375 }
376 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200377 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 }
379 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200380 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200381 }
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_OTHER;
383}
384
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300385/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
386 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000387Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000388PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000389{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000390#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000391 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000392#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000393 /* This is actually an illegal character, so it should
394 not be passed to unichr. */
395 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000396#endif
397}
398
#ifdef Py_DEBUG
/* Debug-build invariant checker for str objects.

   Every condition is verified through _PyObject_ASSERT(op, ...).  The
   checks cover the relationship between the state flags (kind, compact,
   ascii, ready), the data / utf8 / wstr pointers and their cached lengths.
   With a non-zero check_content, the characters of a non-legacy string are
   also scanned to verify that the narrowest possible kind is in use and
   that the buffer ends with a null character.

   Always returns 1, so that the call can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define ASSERT(expr) _PyObject_ASSERT(op, (expr))

    ASSERT(PyUnicode_Check(op));

    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        ASSERT(kind == PyUnicode_1BYTE_KIND);
        ASSERT(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: the characters directly follow
               the PyCompactUnicodeObject header */
            data = compact + 1;
            ASSERT(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            ASSERT(ascii->state.ascii == 0);
            ASSERT(ascii->state.ready == 1);
            ASSERT(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready: only wstr is set */
                ASSERT(ascii->length == 0);
                ASSERT(ascii->hash == -1);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ascii == 0);
                ASSERT(ascii->state.ready == 0);
                ASSERT(ascii->state.interned == SSTATE_NOT_INTERNED);
                ASSERT(ascii->wstr != NULL);
                ASSERT(data == NULL);
                ASSERT(compact->utf8 == NULL);
            }
            else {
                /* legacy string that is ready */
                ASSERT(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ready == 1);
                ASSERT(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    ASSERT(compact->utf8 == data);
                    ASSERT(compact->utf8_length == ascii->length);
                }
                else {
                    ASSERT(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the character data exactly when the kind has
               the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const int wchar_sized = (kind == PyUnicode_2BYTE_KIND);
#else
            const int wchar_sized = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (wchar_sized) {
                ASSERT(ascii->wstr == data);
                ASSERT(compact->wstr_length == ascii->length);
            }
            else {
                ASSERT(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            ASSERT(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            ASSERT(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t i;

        for (i = 0; i < ascii->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch) {
                maxchar = ch;
            }
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii) {
                ASSERT(maxchar < 128);
            }
            else {
                ASSERT(maxchar >= 128);
                ASSERT(maxchar <= 255);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            ASSERT(maxchar >= 0x100);
            ASSERT(maxchar <= 0xFFFF);
        }
        else {
            ASSERT(maxchar >= 0x10000);
            ASSERT(maxchar <= MAX_UNICODE);
        }
        /* the character following the last one must be null */
        ASSERT(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef ASSERT
}
#endif
518
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100519static PyObject*
520unicode_result_wchar(PyObject *unicode)
521{
522#ifndef Py_DEBUG
523 Py_ssize_t len;
524
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100525 len = _PyUnicode_WSTR_LENGTH(unicode);
526 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100527 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200528 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100529 }
530
531 if (len == 1) {
532 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100533 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
535 Py_DECREF(unicode);
536 return latin1_char;
537 }
538 }
539
540 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200541 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 return NULL;
543 }
544#else
Victor Stinneraa771272012-10-04 02:32:58 +0200545 assert(Py_REFCNT(unicode) == 1);
546
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100547 /* don't make the result ready in debug mode to ensure that the caller
548 makes the string ready before using it */
549 assert(_PyUnicode_CheckConsistency(unicode, 1));
550#endif
551 return unicode;
552}
553
554static PyObject*
555unicode_result_ready(PyObject *unicode)
556{
557 Py_ssize_t length;
558
559 length = PyUnicode_GET_LENGTH(unicode);
560 if (length == 0) {
561 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100562 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200563 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100564 }
565 return unicode_empty;
566 }
567
568 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200569 void *data = PyUnicode_DATA(unicode);
570 int kind = PyUnicode_KIND(unicode);
571 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100572 if (ch < 256) {
573 PyObject *latin1_char = unicode_latin1[ch];
574 if (latin1_char != NULL) {
575 if (unicode != latin1_char) {
576 Py_INCREF(latin1_char);
577 Py_DECREF(unicode);
578 }
579 return latin1_char;
580 }
581 else {
582 assert(_PyUnicode_CheckConsistency(unicode, 1));
583 Py_INCREF(unicode);
584 unicode_latin1[ch] = unicode;
585 return unicode;
586 }
587 }
588 }
589
590 assert(_PyUnicode_CheckConsistency(unicode, 1));
591 return unicode;
592}
593
594static PyObject*
595unicode_result(PyObject *unicode)
596{
597 assert(_PyUnicode_CHECK(unicode));
598 if (PyUnicode_IS_READY(unicode))
599 return unicode_result_ready(unicode);
600 else
601 return unicode_result_wchar(unicode);
602}
603
Victor Stinnerc4b49542011-12-11 22:44:26 +0100604static PyObject*
605unicode_result_unchanged(PyObject *unicode)
606{
607 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500608 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100609 return NULL;
610 Py_INCREF(unicode);
611 return unicode;
612 }
613 else
614 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100615 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100616}
617
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200618/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
619 ASCII, Latin1, UTF-8, etc. */
620static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200621backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200622 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
623{
Victor Stinnerad771582015-10-09 12:38:53 +0200624 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 Py_UCS4 ch;
626 enum PyUnicode_Kind kind;
627 void *data;
628
629 assert(PyUnicode_IS_READY(unicode));
630 kind = PyUnicode_KIND(unicode);
631 data = PyUnicode_DATA(unicode);
632
633 size = 0;
634 /* determine replacement size */
635 for (i = collstart; i < collend; ++i) {
636 Py_ssize_t incr;
637
638 ch = PyUnicode_READ(kind, data, i);
639 if (ch < 0x100)
640 incr = 2+2;
641 else if (ch < 0x10000)
642 incr = 2+4;
643 else {
644 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200645 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 }
647 if (size > PY_SSIZE_T_MAX - incr) {
648 PyErr_SetString(PyExc_OverflowError,
649 "encoded result is too long for a Python string");
650 return NULL;
651 }
652 size += incr;
653 }
654
Victor Stinnerad771582015-10-09 12:38:53 +0200655 str = _PyBytesWriter_Prepare(writer, str, size);
656 if (str == NULL)
657 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200658
659 /* generate replacement */
660 for (i = collstart; i < collend; ++i) {
661 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200662 *str++ = '\\';
663 if (ch >= 0x00010000) {
664 *str++ = 'U';
665 *str++ = Py_hexdigits[(ch>>28)&0xf];
666 *str++ = Py_hexdigits[(ch>>24)&0xf];
667 *str++ = Py_hexdigits[(ch>>20)&0xf];
668 *str++ = Py_hexdigits[(ch>>16)&0xf];
669 *str++ = Py_hexdigits[(ch>>12)&0xf];
670 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200671 }
Victor Stinner797485e2015-10-09 03:17:30 +0200672 else if (ch >= 0x100) {
673 *str++ = 'u';
674 *str++ = Py_hexdigits[(ch>>12)&0xf];
675 *str++ = Py_hexdigits[(ch>>8)&0xf];
676 }
677 else
678 *str++ = 'x';
679 *str++ = Py_hexdigits[(ch>>4)&0xf];
680 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200681 }
682 return str;
683}
684
685/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
686 ASCII, Latin1, UTF-8, etc. */
687static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200688xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200689 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
690{
Victor Stinnerad771582015-10-09 12:38:53 +0200691 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200692 Py_UCS4 ch;
693 enum PyUnicode_Kind kind;
694 void *data;
695
696 assert(PyUnicode_IS_READY(unicode));
697 kind = PyUnicode_KIND(unicode);
698 data = PyUnicode_DATA(unicode);
699
700 size = 0;
701 /* determine replacement size */
702 for (i = collstart; i < collend; ++i) {
703 Py_ssize_t incr;
704
705 ch = PyUnicode_READ(kind, data, i);
706 if (ch < 10)
707 incr = 2+1+1;
708 else if (ch < 100)
709 incr = 2+2+1;
710 else if (ch < 1000)
711 incr = 2+3+1;
712 else if (ch < 10000)
713 incr = 2+4+1;
714 else if (ch < 100000)
715 incr = 2+5+1;
716 else if (ch < 1000000)
717 incr = 2+6+1;
718 else {
719 assert(ch <= MAX_UNICODE);
720 incr = 2+7+1;
721 }
722 if (size > PY_SSIZE_T_MAX - incr) {
723 PyErr_SetString(PyExc_OverflowError,
724 "encoded result is too long for a Python string");
725 return NULL;
726 }
727 size += incr;
728 }
729
Victor Stinnerad771582015-10-09 12:38:53 +0200730 str = _PyBytesWriter_Prepare(writer, str, size);
731 if (str == NULL)
732 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200733
734 /* generate replacement */
735 for (i = collstart; i < collend; ++i) {
736 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
737 }
738 return str;
739}
740
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741/* --- Bloom Filters ----------------------------------------------------- */
742
743/* stuff to implement simple "bloom filters" for Unicode characters.
744 to keep things simple, we use a single bitmask, using the least 5
745 bits from each unicode characters as the bit index. */
746
747/* the linebreak mask is set up by Unicode_Init below */
748
Antoine Pitrouf068f942010-01-13 14:19:12 +0000749#if LONG_BIT >= 128
750#define BLOOM_WIDTH 128
751#elif LONG_BIT >= 64
752#define BLOOM_WIDTH 64
753#elif LONG_BIT >= 32
754#define BLOOM_WIDTH 32
755#else
756#error "LONG_BIT is smaller than 32"
757#endif
758
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759#define BLOOM_MASK unsigned long
760
Serhiy Storchaka05997252013-01-26 12:14:02 +0200761static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000762
Antoine Pitrouf068f942010-01-13 14:19:12 +0000763#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000764
Benjamin Peterson29060642009-01-31 22:14:21 +0000765#define BLOOM_LINEBREAK(ch) \
766 ((ch) < 128U ? ascii_linebreak[(ch)] : \
767 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000768
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700769static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200770make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000771{
Victor Stinnera85af502013-04-09 21:53:54 +0200772#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
773 do { \
774 TYPE *data = (TYPE *)PTR; \
775 TYPE *end = data + LEN; \
776 Py_UCS4 ch; \
777 for (; data != end; data++) { \
778 ch = *data; \
779 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
780 } \
781 break; \
782 } while (0)
783
Thomas Wouters477c8d52006-05-27 19:21:47 +0000784 /* calculate simple bloom-style bitmask for a given unicode string */
785
Antoine Pitrouf068f942010-01-13 14:19:12 +0000786 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000787
788 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200789 switch (kind) {
790 case PyUnicode_1BYTE_KIND:
791 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
792 break;
793 case PyUnicode_2BYTE_KIND:
794 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
795 break;
796 case PyUnicode_4BYTE_KIND:
797 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
798 break;
799 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700800 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200801 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000802 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200803
804#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000805}
806
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300807static int
808ensure_unicode(PyObject *obj)
809{
810 if (!PyUnicode_Check(obj)) {
811 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200812 "must be str, not %.100s",
813 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300814 return -1;
815 }
816 return PyUnicode_READY(obj);
817}
818
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819/* Compilation of templated routines */
820
821#include "stringlib/asciilib.h"
822#include "stringlib/fastsearch.h"
823#include "stringlib/partition.h"
824#include "stringlib/split.h"
825#include "stringlib/count.h"
826#include "stringlib/find.h"
827#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200828#include "stringlib/undef.h"
829
830#include "stringlib/ucs1lib.h"
831#include "stringlib/fastsearch.h"
832#include "stringlib/partition.h"
833#include "stringlib/split.h"
834#include "stringlib/count.h"
835#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300836#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200837#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200838#include "stringlib/undef.h"
839
840#include "stringlib/ucs2lib.h"
841#include "stringlib/fastsearch.h"
842#include "stringlib/partition.h"
843#include "stringlib/split.h"
844#include "stringlib/count.h"
845#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300846#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200847#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200848#include "stringlib/undef.h"
849
850#include "stringlib/ucs4lib.h"
851#include "stringlib/fastsearch.h"
852#include "stringlib/partition.h"
853#include "stringlib/split.h"
854#include "stringlib/count.h"
855#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300856#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200857#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200858#include "stringlib/undef.h"
859
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200860#include "stringlib/unicodedefs.h"
861#include "stringlib/fastsearch.h"
862#include "stringlib/count.h"
863#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100864#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865
Guido van Rossumd57fd912000-03-10 22:53:23 +0000866/* --- Unicode Object ----------------------------------------------------- */
867
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700868static inline Py_ssize_t
869findchar(const void *s, int kind,
870 Py_ssize_t size, Py_UCS4 ch,
871 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200872{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200873 switch (kind) {
874 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200875 if ((Py_UCS1) ch != ch)
876 return -1;
877 if (direction > 0)
878 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
879 else
880 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200881 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200882 if ((Py_UCS2) ch != ch)
883 return -1;
884 if (direction > 0)
885 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
886 else
887 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200888 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200889 if (direction > 0)
890 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
891 else
892 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200893 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700894 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200896}
897
Victor Stinnerafffce42012-10-03 23:03:17 +0200898#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000899/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200900 earlier.
901
902 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
903 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
904 invalid character in Unicode 6.0. */
905static void
906unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
907{
908 int kind = PyUnicode_KIND(unicode);
909 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
910 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
911 if (length <= old_length)
912 return;
913 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
914}
915#endif
916
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917static PyObject*
918resize_compact(PyObject *unicode, Py_ssize_t length)
919{
920 Py_ssize_t char_size;
921 Py_ssize_t struct_size;
922 Py_ssize_t new_size;
923 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100924 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200925#ifdef Py_DEBUG
926 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
927#endif
928
Victor Stinner79891572012-05-03 13:43:07 +0200929 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200930 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100931 assert(PyUnicode_IS_COMPACT(unicode));
932
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200933 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100934 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 struct_size = sizeof(PyASCIIObject);
936 else
937 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200938 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
941 PyErr_NoMemory();
942 return NULL;
943 }
944 new_size = (struct_size + (length + 1) * char_size);
945
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200946 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
947 PyObject_DEL(_PyUnicode_UTF8(unicode));
948 _PyUnicode_UTF8(unicode) = NULL;
949 _PyUnicode_UTF8_LENGTH(unicode) = 0;
950 }
Victor Stinner84def372011-12-11 20:04:56 +0100951 _Py_DEC_REFTOTAL;
952 _Py_ForgetReference(unicode);
953
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300954 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100955 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100956 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 PyErr_NoMemory();
958 return NULL;
959 }
Victor Stinner84def372011-12-11 20:04:56 +0100960 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100962
Victor Stinnerfe226c02011-10-03 03:52:20 +0200963 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200965 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100966 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200967 _PyUnicode_WSTR_LENGTH(unicode) = length;
968 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100969 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
970 PyObject_DEL(_PyUnicode_WSTR(unicode));
971 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100972 if (!PyUnicode_IS_ASCII(unicode))
973 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100974 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200975#ifdef Py_DEBUG
976 unicode_fill_invalid(unicode, old_length);
977#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200978 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
979 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200980 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200981 return unicode;
982}
983
Alexander Belopolsky40018472011-02-26 01:02:56 +0000984static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200985resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986{
Victor Stinner95663112011-10-04 01:03:50 +0200987 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100988 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200990 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000991
Victor Stinnerfe226c02011-10-03 03:52:20 +0200992 if (PyUnicode_IS_READY(unicode)) {
993 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200994 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200995 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
998#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999
1000 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001001 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1003 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001004
1005 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1006 PyErr_NoMemory();
1007 return -1;
1008 }
1009 new_size = (length + 1) * char_size;
1010
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1012 {
1013 PyObject_DEL(_PyUnicode_UTF8(unicode));
1014 _PyUnicode_UTF8(unicode) = NULL;
1015 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1016 }
1017
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 data = (PyObject *)PyObject_REALLOC(data, new_size);
1019 if (data == NULL) {
1020 PyErr_NoMemory();
1021 return -1;
1022 }
1023 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001024 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001026 _PyUnicode_WSTR_LENGTH(unicode) = length;
1027 }
1028 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001029 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001030 _PyUnicode_UTF8_LENGTH(unicode) = length;
1031 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001032 _PyUnicode_LENGTH(unicode) = length;
1033 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001034#ifdef Py_DEBUG
1035 unicode_fill_invalid(unicode, old_length);
1036#endif
Victor Stinner95663112011-10-04 01:03:50 +02001037 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001038 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 }
Victor Stinner95663112011-10-04 01:03:50 +02001042 assert(_PyUnicode_WSTR(unicode) != NULL);
1043
1044 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001045 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001046 PyErr_NoMemory();
1047 return -1;
1048 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001049 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001050 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001051 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001052 if (!wstr) {
1053 PyErr_NoMemory();
1054 return -1;
1055 }
1056 _PyUnicode_WSTR(unicode) = wstr;
1057 _PyUnicode_WSTR(unicode)[length] = 0;
1058 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001059 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 return 0;
1061}
1062
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063static PyObject*
1064resize_copy(PyObject *unicode, Py_ssize_t length)
1065{
1066 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001067 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001068 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001069
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001070 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001071
1072 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1073 if (copy == NULL)
1074 return NULL;
1075
1076 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001077 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001079 }
1080 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001081 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001082
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001083 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001084 if (w == NULL)
1085 return NULL;
1086 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1087 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001088 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001089 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001090 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001091 }
1092}
1093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001095 Ux0000 terminated; some code (e.g. new_identifier)
1096 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097
1098 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001099 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100
1101*/
1102
Alexander Belopolsky40018472011-02-26 01:02:56 +00001103static PyUnicodeObject *
1104_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001106 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108
Thomas Wouters477c8d52006-05-27 19:21:47 +00001109 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (length == 0 && unicode_empty != NULL) {
1111 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001112 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 }
1114
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001115 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001116 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001117 return (PyUnicodeObject *)PyErr_NoMemory();
1118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 if (length < 0) {
1120 PyErr_SetString(PyExc_SystemError,
1121 "Negative size passed to _PyUnicode_New");
1122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 }
1124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1126 if (unicode == NULL)
1127 return NULL;
1128 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001129
1130 _PyUnicode_WSTR_LENGTH(unicode) = length;
1131 _PyUnicode_HASH(unicode) = -1;
1132 _PyUnicode_STATE(unicode).interned = 0;
1133 _PyUnicode_STATE(unicode).kind = 0;
1134 _PyUnicode_STATE(unicode).compact = 0;
1135 _PyUnicode_STATE(unicode).ready = 0;
1136 _PyUnicode_STATE(unicode).ascii = 0;
1137 _PyUnicode_DATA_ANY(unicode) = NULL;
1138 _PyUnicode_LENGTH(unicode) = 0;
1139 _PyUnicode_UTF8(unicode) = NULL;
1140 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1143 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001144 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001145 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001146 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148
Jeremy Hyltond8082792003-09-16 19:41:39 +00001149 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001150 * the caller fails before initializing str -- unicode_resize()
1151 * reads str[0], and the Keep-Alive optimization can keep memory
1152 * allocated for str alive across a call to unicode_dealloc(unicode).
1153 * We don't want unicode_resize to read uninitialized memory in
1154 * that case.
1155 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156 _PyUnicode_WSTR(unicode)[0] = 0;
1157 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001158
Victor Stinner7931d9a2011-11-04 00:22:48 +01001159 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160 return unicode;
1161}
1162
Victor Stinnerf42dc442011-10-02 23:33:16 +02001163static const char*
1164unicode_kind_name(PyObject *unicode)
1165{
Victor Stinner42dfd712011-10-03 14:41:45 +02001166 /* don't check consistency: unicode_kind_name() is called from
1167 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001168 if (!PyUnicode_IS_COMPACT(unicode))
1169 {
1170 if (!PyUnicode_IS_READY(unicode))
1171 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001172 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001173 {
1174 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 return "legacy ascii";
1177 else
1178 return "legacy latin1";
1179 case PyUnicode_2BYTE_KIND:
1180 return "legacy UCS2";
1181 case PyUnicode_4BYTE_KIND:
1182 return "legacy UCS4";
1183 default:
1184 return "<legacy invalid kind>";
1185 }
1186 }
1187 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001188 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001189 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001190 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001191 return "ascii";
1192 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001193 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001194 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001195 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001196 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001197 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001198 default:
1199 return "<invalid compact kind>";
1200 }
1201}
1202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001205char *_PyUnicode_utf8(void *unicode_raw){
1206 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001207 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208}
1209
Victor Stinnera42de742018-11-22 10:25:22 +01001210void *_PyUnicode_compact_data(void *unicode_raw) {
1211 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 return _PyUnicode_COMPACT_DATA(unicode);
1213}
Victor Stinnera42de742018-11-22 10:25:22 +01001214void *_PyUnicode_data(void *unicode_raw) {
1215 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 printf("obj %p\n", unicode);
1217 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1218 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1219 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1220 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1221 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1222 return PyUnicode_DATA(unicode);
1223}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001224
1225void
1226_PyUnicode_Dump(PyObject *op)
1227{
1228 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001229 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1230 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1231 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001232
Victor Stinnera849a4b2011-10-03 12:12:11 +02001233 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001234 {
1235 if (ascii->state.ascii)
1236 data = (ascii + 1);
1237 else
1238 data = (compact + 1);
1239 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001240 else
1241 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001242 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1243 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001244
Victor Stinnera849a4b2011-10-03 12:12:11 +02001245 if (ascii->wstr == data)
1246 printf("shared ");
1247 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001248
Victor Stinnera3b334d2011-10-03 13:53:37 +02001249 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001250 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001251 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1252 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001253 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1254 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001255 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001256 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001257}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258#endif
1259
1260PyObject *
1261PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1262{
1263 PyObject *obj;
1264 PyCompactUnicodeObject *unicode;
1265 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001266 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001267 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 Py_ssize_t char_size;
1269 Py_ssize_t struct_size;
1270
1271 /* Optimization for empty strings */
1272 if (size == 0 && unicode_empty != NULL) {
1273 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001274 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275 }
1276
Victor Stinner9e9d6892011-10-04 01:02:02 +02001277 is_ascii = 0;
1278 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 struct_size = sizeof(PyCompactUnicodeObject);
1280 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001281 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 char_size = 1;
1283 is_ascii = 1;
1284 struct_size = sizeof(PyASCIIObject);
1285 }
1286 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001287 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001288 char_size = 1;
1289 }
1290 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001291 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001292 char_size = 2;
1293 if (sizeof(wchar_t) == 2)
1294 is_sharing = 1;
1295 }
1296 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001297 if (maxchar > MAX_UNICODE) {
1298 PyErr_SetString(PyExc_SystemError,
1299 "invalid maximum character passed to PyUnicode_New");
1300 return NULL;
1301 }
Victor Stinner8f825062012-04-27 13:55:39 +02001302 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303 char_size = 4;
1304 if (sizeof(wchar_t) == 4)
1305 is_sharing = 1;
1306 }
1307
1308 /* Ensure we won't overflow the size. */
1309 if (size < 0) {
1310 PyErr_SetString(PyExc_SystemError,
1311 "Negative size passed to PyUnicode_New");
1312 return NULL;
1313 }
1314 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1315 return PyErr_NoMemory();
1316
1317 /* Duplicated allocation code from _PyObject_New() instead of a call to
1318 * PyObject_New() so we are able to allocate space for the object and
1319 * it's data buffer.
1320 */
1321 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1322 if (obj == NULL)
1323 return PyErr_NoMemory();
1324 obj = PyObject_INIT(obj, &PyUnicode_Type);
1325 if (obj == NULL)
1326 return NULL;
1327
1328 unicode = (PyCompactUnicodeObject *)obj;
1329 if (is_ascii)
1330 data = ((PyASCIIObject*)obj) + 1;
1331 else
1332 data = unicode + 1;
1333 _PyUnicode_LENGTH(unicode) = size;
1334 _PyUnicode_HASH(unicode) = -1;
1335 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001336 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 _PyUnicode_STATE(unicode).compact = 1;
1338 _PyUnicode_STATE(unicode).ready = 1;
1339 _PyUnicode_STATE(unicode).ascii = is_ascii;
1340 if (is_ascii) {
1341 ((char*)data)[size] = 0;
1342 _PyUnicode_WSTR(unicode) = NULL;
1343 }
Victor Stinner8f825062012-04-27 13:55:39 +02001344 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345 ((char*)data)[size] = 0;
1346 _PyUnicode_WSTR(unicode) = NULL;
1347 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001349 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 else {
1352 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001353 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001354 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001356 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 ((Py_UCS4*)data)[size] = 0;
1358 if (is_sharing) {
1359 _PyUnicode_WSTR_LENGTH(unicode) = size;
1360 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1361 }
1362 else {
1363 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1364 _PyUnicode_WSTR(unicode) = NULL;
1365 }
1366 }
Victor Stinner8f825062012-04-27 13:55:39 +02001367#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001368 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001369#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001370 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 return obj;
1372}
1373
#if SIZEOF_WCHAR_T == 2
/* Expand the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into a single code point; every other unit (including a lone
   surrogate) is copied through unchanged.

   `unicode` must already be a 4-byte kind string whose length equals the
   number of code points produced; this is only checked by assertions.
   The terminating null character is not written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        Py_UCS4 ch;

        /* There must still be room for at least one more code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            ch = *src;
            src++;
        }
        *dst++ = ch;
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1413
Victor Stinnercd9950f2011-10-02 00:34:53 +02001414static int
Victor Stinner488fa492011-12-12 00:01:39 +01001415unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001416{
Victor Stinner488fa492011-12-12 00:01:39 +01001417 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001418 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001419 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001420 return -1;
1421 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001422 return 0;
1423}
1424
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001425static int
1426_copy_characters(PyObject *to, Py_ssize_t to_start,
1427 PyObject *from, Py_ssize_t from_start,
1428 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001430 unsigned int from_kind, to_kind;
1431 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001432
Victor Stinneree4544c2012-05-09 22:24:08 +02001433 assert(0 <= how_many);
1434 assert(0 <= from_start);
1435 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001436 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001437 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001438 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439
Victor Stinnerd3f08822012-05-29 12:57:52 +02001440 assert(PyUnicode_Check(to));
1441 assert(PyUnicode_IS_READY(to));
1442 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1443
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001444 if (how_many == 0)
1445 return 0;
1446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001448 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001450 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451
Victor Stinnerf1852262012-06-16 16:38:26 +02001452#ifdef Py_DEBUG
1453 if (!check_maxchar
1454 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1455 {
1456 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1457 Py_UCS4 ch;
1458 Py_ssize_t i;
1459 for (i=0; i < how_many; i++) {
1460 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1461 assert(ch <= to_maxchar);
1462 }
1463 }
1464#endif
1465
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001466 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001467 if (check_maxchar
1468 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1469 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001470 /* Writing Latin-1 characters into an ASCII string requires to
1471 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001472 Py_UCS4 max_char;
1473 max_char = ucs1lib_find_max_char(from_data,
1474 (Py_UCS1*)from_data + how_many);
1475 if (max_char >= 128)
1476 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001477 }
Christian Heimesf051e432016-09-13 20:22:02 +02001478 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001479 (char*)from_data + from_kind * from_start,
1480 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001482 else if (from_kind == PyUnicode_1BYTE_KIND
1483 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001484 {
1485 _PyUnicode_CONVERT_BYTES(
1486 Py_UCS1, Py_UCS2,
1487 PyUnicode_1BYTE_DATA(from) + from_start,
1488 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1489 PyUnicode_2BYTE_DATA(to) + to_start
1490 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001491 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001492 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001493 && to_kind == PyUnicode_4BYTE_KIND)
1494 {
1495 _PyUnicode_CONVERT_BYTES(
1496 Py_UCS1, Py_UCS4,
1497 PyUnicode_1BYTE_DATA(from) + from_start,
1498 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1499 PyUnicode_4BYTE_DATA(to) + to_start
1500 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001501 }
1502 else if (from_kind == PyUnicode_2BYTE_KIND
1503 && to_kind == PyUnicode_4BYTE_KIND)
1504 {
1505 _PyUnicode_CONVERT_BYTES(
1506 Py_UCS2, Py_UCS4,
1507 PyUnicode_2BYTE_DATA(from) + from_start,
1508 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1509 PyUnicode_4BYTE_DATA(to) + to_start
1510 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001511 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001513 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1514
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001515 if (!check_maxchar) {
1516 if (from_kind == PyUnicode_2BYTE_KIND
1517 && to_kind == PyUnicode_1BYTE_KIND)
1518 {
1519 _PyUnicode_CONVERT_BYTES(
1520 Py_UCS2, Py_UCS1,
1521 PyUnicode_2BYTE_DATA(from) + from_start,
1522 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1523 PyUnicode_1BYTE_DATA(to) + to_start
1524 );
1525 }
1526 else if (from_kind == PyUnicode_4BYTE_KIND
1527 && to_kind == PyUnicode_1BYTE_KIND)
1528 {
1529 _PyUnicode_CONVERT_BYTES(
1530 Py_UCS4, Py_UCS1,
1531 PyUnicode_4BYTE_DATA(from) + from_start,
1532 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1533 PyUnicode_1BYTE_DATA(to) + to_start
1534 );
1535 }
1536 else if (from_kind == PyUnicode_4BYTE_KIND
1537 && to_kind == PyUnicode_2BYTE_KIND)
1538 {
1539 _PyUnicode_CONVERT_BYTES(
1540 Py_UCS4, Py_UCS2,
1541 PyUnicode_4BYTE_DATA(from) + from_start,
1542 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1543 PyUnicode_2BYTE_DATA(to) + to_start
1544 );
1545 }
1546 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001547 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001548 }
1549 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001550 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001551 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001552 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001553 Py_ssize_t i;
1554
Victor Stinnera0702ab2011-09-29 14:14:38 +02001555 for (i=0; i < how_many; i++) {
1556 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001557 if (ch > to_maxchar)
1558 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001559 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1560 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001561 }
1562 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return 0;
1564}
1565
Victor Stinnerd3f08822012-05-29 12:57:52 +02001566void
1567_PyUnicode_FastCopyCharacters(
1568 PyObject *to, Py_ssize_t to_start,
1569 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570{
1571 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1572}
1573
1574Py_ssize_t
1575PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1576 PyObject *from, Py_ssize_t from_start,
1577 Py_ssize_t how_many)
1578{
1579 int err;
1580
1581 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1582 PyErr_BadInternalCall();
1583 return -1;
1584 }
1585
Benjamin Petersonbac79492012-01-14 13:34:47 -05001586 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001587 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001588 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001589 return -1;
1590
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001591 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001592 PyErr_SetString(PyExc_IndexError, "string index out of range");
1593 return -1;
1594 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001595 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001596 PyErr_SetString(PyExc_IndexError, "string index out of range");
1597 return -1;
1598 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001599 if (how_many < 0) {
1600 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1601 return -1;
1602 }
1603 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001604 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1605 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001606 "Cannot write %zi characters at %zi "
1607 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001608 how_many, to_start, PyUnicode_GET_LENGTH(to));
1609 return -1;
1610 }
1611
1612 if (how_many == 0)
1613 return 0;
1614
Victor Stinner488fa492011-12-12 00:01:39 +01001615 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001616 return -1;
1617
1618 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1619 if (err) {
1620 PyErr_Format(PyExc_SystemError,
1621 "Cannot copy %s characters "
1622 "into a string of %s characters",
1623 unicode_kind_name(from),
1624 unicode_kind_name(to));
1625 return -1;
1626 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001627 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628}
1629
Victor Stinner17222162011-09-28 22:15:37 +02001630/* Find the maximum code point and count the number of surrogate pairs so a
1631 correct string length can be computed before converting a string to UCS4.
1632 This function counts single surrogates as a character and not as a pair.
1633
1634 Return 0 on success, or -1 on error. */
1635static int
1636find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1637 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638{
1639 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001640 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641
Victor Stinnerc53be962011-10-02 21:33:54 +02001642 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001643 *num_surrogates = 0;
1644 *maxchar = 0;
1645
1646 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001648 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1649 && (iter+1) < end
1650 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1651 {
1652 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1653 ++(*num_surrogates);
1654 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 }
1656 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001658 {
1659 ch = *iter;
1660 iter++;
1661 }
1662 if (ch > *maxchar) {
1663 *maxchar = ch;
1664 if (*maxchar > MAX_UNICODE) {
1665 PyErr_Format(PyExc_ValueError,
1666 "character U+%x is not in range [U+0000; U+10ffff]",
1667 ch);
1668 return -1;
1669 }
1670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 }
1672 return 0;
1673}
1674
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001675int
1676_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677{
1678 wchar_t *end;
1679 Py_UCS4 maxchar = 0;
1680 Py_ssize_t num_surrogates;
1681#if SIZEOF_WCHAR_T == 2
1682 Py_ssize_t length_wo_surrogates;
1683#endif
1684
Georg Brandl7597add2011-10-05 16:36:47 +02001685 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001686 strings were created using _PyObject_New() and where no canonical
1687 representation (the str field) has been set yet aka strings
1688 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001689 assert(_PyUnicode_CHECK(unicode));
1690 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001694 /* Actually, it should neither be interned nor be anything else: */
1695 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001698 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001699 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701
1702 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001703 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1704 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 PyErr_NoMemory();
1706 return -1;
1707 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001708 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 _PyUnicode_WSTR(unicode), end,
1710 PyUnicode_1BYTE_DATA(unicode));
1711 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1712 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1713 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1714 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001715 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001716 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 }
1719 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001720 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001721 _PyUnicode_UTF8(unicode) = NULL;
1722 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 }
1724 PyObject_FREE(_PyUnicode_WSTR(unicode));
1725 _PyUnicode_WSTR(unicode) = NULL;
1726 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1727 }
1728 /* In this case we might have to convert down from 4-byte native
1729 wchar_t to 2-byte unicode. */
1730 else if (maxchar < 65536) {
1731 assert(num_surrogates == 0 &&
1732 "FindMaxCharAndNumSurrogatePairs() messed up");
1733
Victor Stinner506f5922011-09-28 22:34:18 +02001734#if SIZEOF_WCHAR_T == 2
1735 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001736 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001737 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1738 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1739 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001740 _PyUnicode_UTF8(unicode) = NULL;
1741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001742#else
1743 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001744 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001745 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001746 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001747 PyErr_NoMemory();
1748 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 }
Victor Stinner506f5922011-09-28 22:34:18 +02001750 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1751 _PyUnicode_WSTR(unicode), end,
1752 PyUnicode_2BYTE_DATA(unicode));
1753 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1754 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001758 PyObject_FREE(_PyUnicode_WSTR(unicode));
1759 _PyUnicode_WSTR(unicode) = NULL;
1760 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1761#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 }
1763 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1764 else {
1765#if SIZEOF_WCHAR_T == 2
1766 /* in case the native representation is 2-bytes, we need to allocate a
1767 new normalized 4-byte version. */
1768 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001769 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1770 PyErr_NoMemory();
1771 return -1;
1772 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001773 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1774 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 PyErr_NoMemory();
1776 return -1;
1777 }
1778 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1779 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001780 _PyUnicode_UTF8(unicode) = NULL;
1781 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001782 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1783 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001784 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 PyObject_FREE(_PyUnicode_WSTR(unicode));
1786 _PyUnicode_WSTR(unicode) = NULL;
1787 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1788#else
1789 assert(num_surrogates == 0);
1790
Victor Stinnerc3c74152011-10-02 20:39:55 +02001791 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001793 _PyUnicode_UTF8(unicode) = NULL;
1794 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1796#endif
1797 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1798 }
1799 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001800 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 return 0;
1802}
1803
Alexander Belopolsky40018472011-02-26 01:02:56 +00001804static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001805unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806{
Walter Dörwald16807132007-05-25 13:52:07 +00001807 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 case SSTATE_NOT_INTERNED:
1809 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001810
Benjamin Peterson29060642009-01-31 22:14:21 +00001811 case SSTATE_INTERNED_MORTAL:
1812 /* revive dead object temporarily for DelItem */
1813 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001814 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001815 Py_FatalError(
1816 "deletion of interned string failed");
1817 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001818
Benjamin Peterson29060642009-01-31 22:14:21 +00001819 case SSTATE_INTERNED_IMMORTAL:
1820 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001821 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001822
Benjamin Peterson29060642009-01-31 22:14:21 +00001823 default:
1824 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001825 }
1826
Victor Stinner03490912011-10-03 23:45:12 +02001827 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001829 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001830 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001831 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1832 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001834 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835}
1836
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1853
Alexander Belopolsky40018472011-02-26 01:02:56 +00001854static int
Victor Stinner488fa492011-12-12 00:01:39 +01001855unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001856{
Victor Stinner488fa492011-12-12 00:01:39 +01001857 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001858 if (Py_REFCNT(unicode) != 1)
1859 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001860 if (_PyUnicode_HASH(unicode) != -1)
1861 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 if (PyUnicode_CHECK_INTERNED(unicode))
1863 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001864 if (!PyUnicode_CheckExact(unicode))
1865 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001866#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001867 /* singleton refcount is greater than 1 */
1868 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001869#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001870 return 1;
1871}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001872
Victor Stinnerfe226c02011-10-03 03:52:20 +02001873static int
1874unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1875{
1876 PyObject *unicode;
1877 Py_ssize_t old_length;
1878
1879 assert(p_unicode != NULL);
1880 unicode = *p_unicode;
1881
1882 assert(unicode != NULL);
1883 assert(PyUnicode_Check(unicode));
1884 assert(0 <= length);
1885
Victor Stinner910337b2011-10-03 03:20:16 +02001886 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 old_length = PyUnicode_WSTR_LENGTH(unicode);
1888 else
1889 old_length = PyUnicode_GET_LENGTH(unicode);
1890 if (old_length == length)
1891 return 0;
1892
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001893 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001894 _Py_INCREF_UNICODE_EMPTY();
1895 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001896 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001897 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001898 return 0;
1899 }
1900
Victor Stinner488fa492011-12-12 00:01:39 +01001901 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001902 PyObject *copy = resize_copy(unicode, length);
1903 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001905 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001906 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001907 }
1908
Victor Stinnerfe226c02011-10-03 03:52:20 +02001909 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001910 PyObject *new_unicode = resize_compact(unicode, length);
1911 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001912 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001913 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001914 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001915 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001916 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001917}
1918
Alexander Belopolsky40018472011-02-26 01:02:56 +00001919int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001920PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001921{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001922 PyObject *unicode;
1923 if (p_unicode == NULL) {
1924 PyErr_BadInternalCall();
1925 return -1;
1926 }
1927 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001928 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001929 {
1930 PyErr_BadInternalCall();
1931 return -1;
1932 }
1933 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001934}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001935
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001936/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001937
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001938 WARNING: The function doesn't copy the terminating null character and
1939 doesn't check the maximum character (may write a latin1 character in an
1940 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001941static void
1942unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1943 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944{
1945 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1946 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001947 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948
1949 switch (kind) {
1950 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001951 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001952#ifdef Py_DEBUG
1953 if (PyUnicode_IS_ASCII(unicode)) {
1954 Py_UCS4 maxchar = ucs1lib_find_max_char(
1955 (const Py_UCS1*)str,
1956 (const Py_UCS1*)str + len);
1957 assert(maxchar < 128);
1958 }
1959#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001960 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001961 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001962 }
1963 case PyUnicode_2BYTE_KIND: {
1964 Py_UCS2 *start = (Py_UCS2 *)data + index;
1965 Py_UCS2 *ucs2 = start;
1966 assert(index <= PyUnicode_GET_LENGTH(unicode));
1967
Victor Stinner184252a2012-06-16 02:57:41 +02001968 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001969 *ucs2 = (Py_UCS2)*str;
1970
1971 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001972 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001973 }
1974 default: {
1975 Py_UCS4 *start = (Py_UCS4 *)data + index;
1976 Py_UCS4 *ucs4 = start;
1977 assert(kind == PyUnicode_4BYTE_KIND);
1978 assert(index <= PyUnicode_GET_LENGTH(unicode));
1979
Victor Stinner184252a2012-06-16 02:57:41 +02001980 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001981 *ucs4 = (Py_UCS4)*str;
1982
1983 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001984 }
1985 }
1986}
1987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988static PyObject*
1989get_latin1_char(unsigned char ch)
1990{
Victor Stinnera464fc12011-10-02 20:39:30 +02001991 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001993 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 if (!unicode)
1995 return NULL;
1996 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001997 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 unicode_latin1[ch] = unicode;
1999 }
2000 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002001 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002}
2003
Victor Stinner985a82a2014-01-03 12:53:47 +01002004static PyObject*
2005unicode_char(Py_UCS4 ch)
2006{
2007 PyObject *unicode;
2008
2009 assert(ch <= MAX_UNICODE);
2010
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002011 if (ch < 256)
2012 return get_latin1_char(ch);
2013
Victor Stinner985a82a2014-01-03 12:53:47 +01002014 unicode = PyUnicode_New(1, ch);
2015 if (unicode == NULL)
2016 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002017
2018 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2019 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002020 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002021 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002022 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2023 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2024 }
2025 assert(_PyUnicode_CheckConsistency(unicode, 1));
2026 return unicode;
2027}
2028
Alexander Belopolsky40018472011-02-26 01:02:56 +00002029PyObject *
2030PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002032 if (u == NULL)
2033 return (PyObject*)_PyUnicode_New(size);
2034
2035 if (size < 0) {
2036 PyErr_BadInternalCall();
2037 return NULL;
2038 }
2039
2040 return PyUnicode_FromWideChar(u, size);
2041}
2042
2043PyObject *
2044PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2045{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002046 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 Py_UCS4 maxchar = 0;
2048 Py_ssize_t num_surrogates;
2049
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002050 if (u == NULL && size != 0) {
2051 PyErr_BadInternalCall();
2052 return NULL;
2053 }
2054
2055 if (size == -1) {
2056 size = wcslen(u);
2057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002059 /* If the Unicode data is known at construction time, we can apply
2060 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002063 if (size == 0)
2064 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 /* Single character Unicode objects in the Latin-1 range are
2067 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002068 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002069 return get_latin1_char((unsigned char)*u);
2070
2071 /* If not empty and not single character, copy the Unicode data
2072 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002073 if (find_maxchar_surrogates(u, u + size,
2074 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002075 return NULL;
2076
Victor Stinner8faf8212011-12-08 22:14:11 +01002077 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 if (!unicode)
2079 return NULL;
2080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 switch (PyUnicode_KIND(unicode)) {
2082 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002083 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2085 break;
2086 case PyUnicode_2BYTE_KIND:
2087#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002088 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002090 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2092#endif
2093 break;
2094 case PyUnicode_4BYTE_KIND:
2095#if SIZEOF_WCHAR_T == 2
2096 /* This is the only case which has to process surrogates, thus
2097 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002098 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099#else
2100 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002101 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102#endif
2103 break;
2104 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002105 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002108 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109}
2110
Alexander Belopolsky40018472011-02-26 01:02:56 +00002111PyObject *
2112PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002113{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002114 if (size < 0) {
2115 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002116 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002117 return NULL;
2118 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002119 if (u != NULL)
2120 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2121 else
2122 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002123}
2124
Alexander Belopolsky40018472011-02-26 01:02:56 +00002125PyObject *
2126PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002127{
2128 size_t size = strlen(u);
2129 if (size > PY_SSIZE_T_MAX) {
2130 PyErr_SetString(PyExc_OverflowError, "input too long");
2131 return NULL;
2132 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002133 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002134}
2135
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002136PyObject *
2137_PyUnicode_FromId(_Py_Identifier *id)
2138{
2139 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002140 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2141 strlen(id->string),
2142 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002143 if (!id->object)
2144 return NULL;
2145 PyUnicode_InternInPlace(&id->object);
2146 assert(!id->next);
2147 id->next = static_strings;
2148 static_strings = id;
2149 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002150 return id->object;
2151}
2152
2153void
2154_PyUnicode_ClearStaticStrings()
2155{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002156 _Py_Identifier *tmp, *s = static_strings;
2157 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002158 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002159 tmp = s->next;
2160 s->next = NULL;
2161 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002162 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002163 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002164}
2165
Benjamin Peterson0df54292012-03-26 14:50:32 -04002166/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002167
Victor Stinnerd3f08822012-05-29 12:57:52 +02002168PyObject*
2169_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002170{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002171 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002172 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002173 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002174#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002175 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002176#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002177 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002178 }
Victor Stinner785938e2011-12-11 20:09:03 +01002179 unicode = PyUnicode_New(size, 127);
2180 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002181 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002182 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2183 assert(_PyUnicode_CheckConsistency(unicode, 1));
2184 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002185}
2186
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002187static Py_UCS4
2188kind_maxchar_limit(unsigned int kind)
2189{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002190 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002191 case PyUnicode_1BYTE_KIND:
2192 return 0x80;
2193 case PyUnicode_2BYTE_KIND:
2194 return 0x100;
2195 case PyUnicode_4BYTE_KIND:
2196 return 0x10000;
2197 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002198 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002199 }
2200}
2201
Victor Stinner702c7342011-10-05 13:50:52 +02002202static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002203_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002206 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002207
Serhiy Storchaka678db842013-01-26 12:16:36 +02002208 if (size == 0)
2209 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002211 if (size == 1)
2212 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002213
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002214 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002215 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 if (!res)
2217 return NULL;
2218 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002219 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002221}
2222
Victor Stinnere57b1c02011-09-28 22:20:48 +02002223static PyObject*
2224_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225{
2226 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002227 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228
Serhiy Storchaka678db842013-01-26 12:16:36 +02002229 if (size == 0)
2230 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002231 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002232 if (size == 1)
2233 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002234
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002235 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002236 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 if (!res)
2238 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002239 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002241 else {
2242 _PyUnicode_CONVERT_BYTES(
2243 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2244 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002245 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 return res;
2247}
2248
Victor Stinnere57b1c02011-09-28 22:20:48 +02002249static PyObject*
2250_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251{
2252 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002253 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002254
Serhiy Storchaka678db842013-01-26 12:16:36 +02002255 if (size == 0)
2256 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002257 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002258 if (size == 1)
2259 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002260
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002261 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002262 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 if (!res)
2264 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002265 if (max_char < 256)
2266 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2267 PyUnicode_1BYTE_DATA(res));
2268 else if (max_char < 0x10000)
2269 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2270 PyUnicode_2BYTE_DATA(res));
2271 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002273 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 return res;
2275}
2276
2277PyObject*
2278PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2279{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002280 if (size < 0) {
2281 PyErr_SetString(PyExc_ValueError, "size must be positive");
2282 return NULL;
2283 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002284 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002286 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002287 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002288 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002290 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002291 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292 PyErr_SetString(PyExc_SystemError, "invalid kind");
2293 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295}
2296
Victor Stinnerece58de2012-04-23 23:36:38 +02002297Py_UCS4
2298_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2299{
2300 enum PyUnicode_Kind kind;
2301 void *startptr, *endptr;
2302
2303 assert(PyUnicode_IS_READY(unicode));
2304 assert(0 <= start);
2305 assert(end <= PyUnicode_GET_LENGTH(unicode));
2306 assert(start <= end);
2307
2308 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2309 return PyUnicode_MAX_CHAR_VALUE(unicode);
2310
2311 if (start == end)
2312 return 127;
2313
Victor Stinner94d558b2012-04-27 22:26:58 +02002314 if (PyUnicode_IS_ASCII(unicode))
2315 return 127;
2316
Victor Stinnerece58de2012-04-23 23:36:38 +02002317 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002318 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002319 endptr = (char *)startptr + end * kind;
2320 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002321 switch(kind) {
2322 case PyUnicode_1BYTE_KIND:
2323 return ucs1lib_find_max_char(startptr, endptr);
2324 case PyUnicode_2BYTE_KIND:
2325 return ucs2lib_find_max_char(startptr, endptr);
2326 case PyUnicode_4BYTE_KIND:
2327 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002328 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002329 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002330 }
2331}
2332
Victor Stinner25a4b292011-10-06 12:31:55 +02002333/* Ensure that a string uses the most efficient storage, if it is not the
2334 case: create a new string with of the right kind. Write NULL into *p_unicode
2335 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002336static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002337unicode_adjust_maxchar(PyObject **p_unicode)
2338{
2339 PyObject *unicode, *copy;
2340 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002341 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002342 unsigned int kind;
2343
2344 assert(p_unicode != NULL);
2345 unicode = *p_unicode;
2346 assert(PyUnicode_IS_READY(unicode));
2347 if (PyUnicode_IS_ASCII(unicode))
2348 return;
2349
2350 len = PyUnicode_GET_LENGTH(unicode);
2351 kind = PyUnicode_KIND(unicode);
2352 if (kind == PyUnicode_1BYTE_KIND) {
2353 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002354 max_char = ucs1lib_find_max_char(u, u + len);
2355 if (max_char >= 128)
2356 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002357 }
2358 else if (kind == PyUnicode_2BYTE_KIND) {
2359 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002360 max_char = ucs2lib_find_max_char(u, u + len);
2361 if (max_char >= 256)
2362 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002363 }
2364 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002365 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002366 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002367 max_char = ucs4lib_find_max_char(u, u + len);
2368 if (max_char >= 0x10000)
2369 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002370 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002371 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002372 if (copy != NULL)
2373 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002374 Py_DECREF(unicode);
2375 *p_unicode = copy;
2376}
2377
Victor Stinner034f6cf2011-09-30 02:26:44 +02002378PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002379_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002380{
Victor Stinner87af4f22011-11-21 23:03:47 +01002381 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002382 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383
Victor Stinner034f6cf2011-09-30 02:26:44 +02002384 if (!PyUnicode_Check(unicode)) {
2385 PyErr_BadInternalCall();
2386 return NULL;
2387 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002388 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002389 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002390
Victor Stinner87af4f22011-11-21 23:03:47 +01002391 length = PyUnicode_GET_LENGTH(unicode);
2392 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002393 if (!copy)
2394 return NULL;
2395 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2396
Christian Heimesf051e432016-09-13 20:22:02 +02002397 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002398 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002399 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002400 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002401}
2402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403
Victor Stinnerbc603d12011-10-02 01:00:40 +02002404/* Widen Unicode objects to larger buffers. Don't write terminating null
2405 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406
2407void*
2408_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2409{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002410 Py_ssize_t len;
2411 void *result;
2412 unsigned int skind;
2413
Benjamin Petersonbac79492012-01-14 13:34:47 -05002414 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002415 return NULL;
2416
2417 len = PyUnicode_GET_LENGTH(s);
2418 skind = PyUnicode_KIND(s);
2419 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002420 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 return NULL;
2422 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002423 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002424 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002425 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002426 if (!result)
2427 return PyErr_NoMemory();
2428 assert(skind == PyUnicode_1BYTE_KIND);
2429 _PyUnicode_CONVERT_BYTES(
2430 Py_UCS1, Py_UCS2,
2431 PyUnicode_1BYTE_DATA(s),
2432 PyUnicode_1BYTE_DATA(s) + len,
2433 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002435 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002436 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002437 if (!result)
2438 return PyErr_NoMemory();
2439 if (skind == PyUnicode_2BYTE_KIND) {
2440 _PyUnicode_CONVERT_BYTES(
2441 Py_UCS2, Py_UCS4,
2442 PyUnicode_2BYTE_DATA(s),
2443 PyUnicode_2BYTE_DATA(s) + len,
2444 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002446 else {
2447 assert(skind == PyUnicode_1BYTE_KIND);
2448 _PyUnicode_CONVERT_BYTES(
2449 Py_UCS1, Py_UCS4,
2450 PyUnicode_1BYTE_DATA(s),
2451 PyUnicode_1BYTE_DATA(s) + len,
2452 result);
2453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002455 default:
2456 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 }
Victor Stinner01698042011-10-04 00:04:26 +02002458 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return NULL;
2460}
2461
2462static Py_UCS4*
2463as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2464 int copy_null)
2465{
2466 int kind;
2467 void *data;
2468 Py_ssize_t len, targetlen;
2469 if (PyUnicode_READY(string) == -1)
2470 return NULL;
2471 kind = PyUnicode_KIND(string);
2472 data = PyUnicode_DATA(string);
2473 len = PyUnicode_GET_LENGTH(string);
2474 targetlen = len;
2475 if (copy_null)
2476 targetlen++;
2477 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002478 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 if (!target) {
2480 PyErr_NoMemory();
2481 return NULL;
2482 }
2483 }
2484 else {
2485 if (targetsize < targetlen) {
2486 PyErr_Format(PyExc_SystemError,
2487 "string is longer than the buffer");
2488 if (copy_null && 0 < targetsize)
2489 target[0] = 0;
2490 return NULL;
2491 }
2492 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002493 if (kind == PyUnicode_1BYTE_KIND) {
2494 Py_UCS1 *start = (Py_UCS1 *) data;
2495 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002497 else if (kind == PyUnicode_2BYTE_KIND) {
2498 Py_UCS2 *start = (Py_UCS2 *) data;
2499 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2500 }
2501 else {
2502 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002503 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 if (copy_null)
2506 target[len] = 0;
2507 return target;
2508}
2509
2510Py_UCS4*
2511PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2512 int copy_null)
2513{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002514 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 PyErr_BadInternalCall();
2516 return NULL;
2517 }
2518 return as_ucs4(string, target, targetsize, copy_null);
2519}
2520
2521Py_UCS4*
2522PyUnicode_AsUCS4Copy(PyObject *string)
2523{
2524 return as_ucs4(string, NULL, 0, 1);
2525}
2526
/* Upper bound on the number of characters produced when formatting with
   %lld or %p.  At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are
   needed, plus 1 for the sign; 53/22 is used as an upper bound for
   log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002531
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002532static int
2533unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2534 Py_ssize_t width, Py_ssize_t precision)
2535{
2536 Py_ssize_t length, fill, arglen;
2537 Py_UCS4 maxchar;
2538
2539 if (PyUnicode_READY(str) == -1)
2540 return -1;
2541
2542 length = PyUnicode_GET_LENGTH(str);
2543 if ((precision == -1 || precision >= length)
2544 && width <= length)
2545 return _PyUnicodeWriter_WriteStr(writer, str);
2546
2547 if (precision != -1)
2548 length = Py_MIN(precision, length);
2549
2550 arglen = Py_MAX(length, width);
2551 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2552 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2553 else
2554 maxchar = writer->maxchar;
2555
2556 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2557 return -1;
2558
2559 if (width > length) {
2560 fill = width - length;
2561 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2562 return -1;
2563 writer->pos += fill;
2564 }
2565
2566 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2567 str, 0, length);
2568 writer->pos += length;
2569 return 0;
2570}
2571
2572static int
Victor Stinner998b8062018-09-12 00:23:25 +02002573unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 Py_ssize_t width, Py_ssize_t precision)
2575{
2576 /* UTF-8 */
2577 Py_ssize_t length;
2578 PyObject *unicode;
2579 int res;
2580
2581 length = strlen(str);
2582 if (precision != -1)
2583 length = Py_MIN(length, precision);
2584 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2585 if (unicode == NULL)
2586 return -1;
2587
2588 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2589 Py_DECREF(unicode);
2590 return res;
2591}
2592
Victor Stinner96865452011-03-01 23:44:09 +00002593static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002594unicode_fromformat_arg(_PyUnicodeWriter *writer,
2595 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002596{
Victor Stinnere215d962012-10-06 23:03:36 +02002597 const char *p;
2598 Py_ssize_t len;
2599 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002600 Py_ssize_t width;
2601 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002602 int longflag;
2603 int longlongflag;
2604 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002605 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002606
2607 p = f;
2608 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002609 zeropad = 0;
2610 if (*f == '0') {
2611 zeropad = 1;
2612 f++;
2613 }
Victor Stinner96865452011-03-01 23:44:09 +00002614
2615 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002616 width = -1;
2617 if (Py_ISDIGIT((unsigned)*f)) {
2618 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002619 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002620 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002621 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002622 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002623 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002624 return NULL;
2625 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002626 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002627 f++;
2628 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629 }
2630 precision = -1;
2631 if (*f == '.') {
2632 f++;
2633 if (Py_ISDIGIT((unsigned)*f)) {
2634 precision = (*f - '0');
2635 f++;
2636 while (Py_ISDIGIT((unsigned)*f)) {
2637 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2638 PyErr_SetString(PyExc_ValueError,
2639 "precision too big");
2640 return NULL;
2641 }
2642 precision = (precision * 10) + (*f - '0');
2643 f++;
2644 }
2645 }
Victor Stinner96865452011-03-01 23:44:09 +00002646 if (*f == '%') {
2647 /* "%.3%s" => f points to "3" */
2648 f--;
2649 }
2650 }
2651 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002652 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002653 f--;
2654 }
Victor Stinner96865452011-03-01 23:44:09 +00002655
2656 /* Handle %ld, %lu, %lld and %llu. */
2657 longflag = 0;
2658 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002659 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002660 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002661 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002662 longflag = 1;
2663 ++f;
2664 }
Victor Stinner96865452011-03-01 23:44:09 +00002665 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002666 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002667 longlongflag = 1;
2668 f += 2;
2669 }
Victor Stinner96865452011-03-01 23:44:09 +00002670 }
2671 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002672 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002673 size_tflag = 1;
2674 ++f;
2675 }
Victor Stinnere215d962012-10-06 23:03:36 +02002676
2677 if (f[1] == '\0')
2678 writer->overallocate = 0;
2679
2680 switch (*f) {
2681 case 'c':
2682 {
2683 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002684 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002685 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002686 "character argument not in range(0x110000)");
2687 return NULL;
2688 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002689 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002690 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002691 break;
2692 }
2693
2694 case 'i':
2695 case 'd':
2696 case 'u':
2697 case 'x':
2698 {
2699 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002700 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002701 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002702
2703 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002704 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002705 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002706 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002707 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002708 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002709 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002710 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002711 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002712 va_arg(*vargs, size_t));
2713 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002714 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002715 va_arg(*vargs, unsigned int));
2716 }
2717 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002718 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002719 }
2720 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002721 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002722 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002723 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002724 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002725 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002726 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002727 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002728 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002729 va_arg(*vargs, Py_ssize_t));
2730 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002731 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002732 va_arg(*vargs, int));
2733 }
2734 assert(len >= 0);
2735
Victor Stinnere215d962012-10-06 23:03:36 +02002736 if (precision < len)
2737 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002738
2739 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002740 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2741 return NULL;
2742
Victor Stinnere215d962012-10-06 23:03:36 +02002743 if (width > precision) {
2744 Py_UCS4 fillchar;
2745 fill = width - precision;
2746 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002747 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2748 return NULL;
2749 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002750 }
Victor Stinner15a11362012-10-06 23:48:20 +02002751 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002752 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002753 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2754 return NULL;
2755 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002756 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002757
Victor Stinner4a587072013-11-19 12:54:53 +01002758 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2759 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002760 break;
2761 }
2762
2763 case 'p':
2764 {
2765 char number[MAX_LONG_LONG_CHARS];
2766
2767 len = sprintf(number, "%p", va_arg(*vargs, void*));
2768 assert(len >= 0);
2769
2770 /* %p is ill-defined: ensure leading 0x. */
2771 if (number[1] == 'X')
2772 number[1] = 'x';
2773 else if (number[1] != 'x') {
2774 memmove(number + 2, number,
2775 strlen(number) + 1);
2776 number[0] = '0';
2777 number[1] = 'x';
2778 len += 2;
2779 }
2780
Victor Stinner4a587072013-11-19 12:54:53 +01002781 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002782 return NULL;
2783 break;
2784 }
2785
2786 case 's':
2787 {
2788 /* UTF-8 */
2789 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002790 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002792 break;
2793 }
2794
2795 case 'U':
2796 {
2797 PyObject *obj = va_arg(*vargs, PyObject *);
2798 assert(obj && _PyUnicode_CHECK(obj));
2799
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002800 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002801 return NULL;
2802 break;
2803 }
2804
2805 case 'V':
2806 {
2807 PyObject *obj = va_arg(*vargs, PyObject *);
2808 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002809 if (obj) {
2810 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002811 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002812 return NULL;
2813 }
2814 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002815 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002816 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002817 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002818 }
2819 break;
2820 }
2821
2822 case 'S':
2823 {
2824 PyObject *obj = va_arg(*vargs, PyObject *);
2825 PyObject *str;
2826 assert(obj);
2827 str = PyObject_Str(obj);
2828 if (!str)
2829 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002830 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002831 Py_DECREF(str);
2832 return NULL;
2833 }
2834 Py_DECREF(str);
2835 break;
2836 }
2837
2838 case 'R':
2839 {
2840 PyObject *obj = va_arg(*vargs, PyObject *);
2841 PyObject *repr;
2842 assert(obj);
2843 repr = PyObject_Repr(obj);
2844 if (!repr)
2845 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002846 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002847 Py_DECREF(repr);
2848 return NULL;
2849 }
2850 Py_DECREF(repr);
2851 break;
2852 }
2853
2854 case 'A':
2855 {
2856 PyObject *obj = va_arg(*vargs, PyObject *);
2857 PyObject *ascii;
2858 assert(obj);
2859 ascii = PyObject_ASCII(obj);
2860 if (!ascii)
2861 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002862 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002863 Py_DECREF(ascii);
2864 return NULL;
2865 }
2866 Py_DECREF(ascii);
2867 break;
2868 }
2869
2870 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002871 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002872 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002873 break;
2874
2875 default:
2876 /* if we stumble upon an unknown formatting code, copy the rest
2877 of the format string to the output string. (we cannot just
2878 skip the code, since there's no way to know what's in the
2879 argument list) */
2880 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002881 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002882 return NULL;
2883 f = p+len;
2884 return f;
2885 }
2886
2887 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002888 return f;
2889}
2890
Walter Dörwaldd2034312007-05-18 16:29:38 +00002891PyObject *
2892PyUnicode_FromFormatV(const char *format, va_list vargs)
2893{
Victor Stinnere215d962012-10-06 23:03:36 +02002894 va_list vargs2;
2895 const char *f;
2896 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002897
Victor Stinner8f674cc2013-04-17 23:02:17 +02002898 _PyUnicodeWriter_Init(&writer);
2899 writer.min_length = strlen(format) + 100;
2900 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002901
Benjamin Peterson0c212142016-09-20 20:39:33 -07002902 // Copy varags to be able to pass a reference to a subfunction.
2903 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002904
2905 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002906 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002907 f = unicode_fromformat_arg(&writer, f, &vargs2);
2908 if (f == NULL)
2909 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002910 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002911 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002912 const char *p;
2913 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002914
Victor Stinnere215d962012-10-06 23:03:36 +02002915 p = f;
2916 do
2917 {
2918 if ((unsigned char)*p > 127) {
2919 PyErr_Format(PyExc_ValueError,
2920 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2921 "string, got a non-ASCII byte: 0x%02x",
2922 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002923 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002924 }
2925 p++;
2926 }
2927 while (*p != '\0' && *p != '%');
2928 len = p - f;
2929
2930 if (*p == '\0')
2931 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002932
2933 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002934 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002935
2936 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002937 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002939 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002940 return _PyUnicodeWriter_Finish(&writer);
2941
2942 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002943 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002944 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002945 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002946}
2947
Walter Dörwaldd2034312007-05-18 16:29:38 +00002948PyObject *
2949PyUnicode_FromFormat(const char *format, ...)
2950{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002951 PyObject* ret;
2952 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002953
2954#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002955 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002956#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002957 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002958#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002959 ret = PyUnicode_FromFormatV(format, vargs);
2960 va_end(vargs);
2961 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002962}
2963
Serhiy Storchakac46db922018-10-23 22:58:24 +03002964static Py_ssize_t
2965unicode_get_widechar_size(PyObject *unicode)
2966{
2967 Py_ssize_t res;
2968
2969 assert(unicode != NULL);
2970 assert(_PyUnicode_CHECK(unicode));
2971
2972 if (_PyUnicode_WSTR(unicode) != NULL) {
2973 return PyUnicode_WSTR_LENGTH(unicode);
2974 }
2975 assert(PyUnicode_IS_READY(unicode));
2976
2977 res = _PyUnicode_LENGTH(unicode);
2978#if SIZEOF_WCHAR_T == 2
2979 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2980 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2981 const Py_UCS4 *end = s + res;
2982 for (; s < end; ++s) {
2983 if (*s > 0xFFFF) {
2984 ++res;
2985 }
2986 }
2987 }
2988#endif
2989 return res;
2990}
2991
2992static void
2993unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
2994{
2995 const wchar_t *wstr;
2996
2997 assert(unicode != NULL);
2998 assert(_PyUnicode_CHECK(unicode));
2999
3000 wstr = _PyUnicode_WSTR(unicode);
3001 if (wstr != NULL) {
3002 memcpy(w, wstr, size * sizeof(wchar_t));
3003 return;
3004 }
3005 assert(PyUnicode_IS_READY(unicode));
3006
3007 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3008 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3009 for (; size--; ++s, ++w) {
3010 *w = *s;
3011 }
3012 }
3013 else {
3014#if SIZEOF_WCHAR_T == 4
3015 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3016 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3017 for (; size--; ++s, ++w) {
3018 *w = *s;
3019 }
3020#else
3021 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3022 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3023 for (; size--; ++s, ++w) {
3024 Py_UCS4 ch = *s;
3025 if (ch > 0xFFFF) {
3026 assert(ch <= MAX_UNICODE);
3027 /* encode surrogate pair in this case */
3028 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3029 if (!size--)
3030 break;
3031 *w = Py_UNICODE_LOW_SURROGATE(ch);
3032 }
3033 else {
3034 *w = ch;
3035 }
3036 }
3037#endif
3038 }
3039}
3040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003041#ifdef HAVE_WCHAR_H
3042
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003043/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003044
Victor Stinnerd88d9832011-09-06 02:00:05 +02003045 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003046 character) required to convert the unicode object. Ignore size argument.
3047
Victor Stinnerd88d9832011-09-06 02:00:05 +02003048 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003049 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003050 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003051Py_ssize_t
3052PyUnicode_AsWideChar(PyObject *unicode,
3053 wchar_t *w,
3054 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003055{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003056 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003057
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003058 if (unicode == NULL) {
3059 PyErr_BadInternalCall();
3060 return -1;
3061 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003062 if (!PyUnicode_Check(unicode)) {
3063 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003064 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003065 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003066
3067 res = unicode_get_widechar_size(unicode);
3068 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003069 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003070 }
3071
3072 if (size > res) {
3073 size = res + 1;
3074 }
3075 else {
3076 res = size;
3077 }
3078 unicode_copy_as_widechar(unicode, w, size);
3079 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003080}
3081
Victor Stinner137c34c2010-09-29 10:25:54 +00003082wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003083PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003084 Py_ssize_t *size)
3085{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003086 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003087 Py_ssize_t buflen;
3088
3089 if (unicode == NULL) {
3090 PyErr_BadInternalCall();
3091 return NULL;
3092 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003093 if (!PyUnicode_Check(unicode)) {
3094 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003095 return NULL;
3096 }
3097
Serhiy Storchakac46db922018-10-23 22:58:24 +03003098 buflen = unicode_get_widechar_size(unicode);
3099 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003100 if (buffer == NULL) {
3101 PyErr_NoMemory();
3102 return NULL;
3103 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003104 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3105 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003106 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003107 }
3108 else if (wcslen(buffer) != (size_t)buflen) {
3109 PyMem_FREE(buffer);
3110 PyErr_SetString(PyExc_ValueError,
3111 "embedded null character");
3112 return NULL;
3113 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003114 return buffer;
3115}
3116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003117#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118
Alexander Belopolsky40018472011-02-26 01:02:56 +00003119PyObject *
3120PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003121{
Victor Stinner8faf8212011-12-08 22:14:11 +01003122 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003123 PyErr_SetString(PyExc_ValueError,
3124 "chr() arg not in range(0x110000)");
3125 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003126 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003127
Victor Stinner985a82a2014-01-03 12:53:47 +01003128 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003129}
3130
Alexander Belopolsky40018472011-02-26 01:02:56 +00003131PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003132PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003134 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003135 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003136 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003137 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003138 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003139 Py_INCREF(obj);
3140 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003141 }
3142 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003143 /* For a Unicode subtype that's not a Unicode object,
3144 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003145 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003146 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003147 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003148 "Can't convert '%.100s' object to str implicitly",
3149 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003150 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003151}
3152
Alexander Belopolsky40018472011-02-26 01:02:56 +00003153PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003154PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003155 const char *encoding,
3156 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003157{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003158 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003159 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003160
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003162 PyErr_BadInternalCall();
3163 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003165
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003166 /* Decoding bytes objects is the most common case and should be fast */
3167 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003168 if (PyBytes_GET_SIZE(obj) == 0)
3169 _Py_RETURN_UNICODE_EMPTY();
3170 v = PyUnicode_Decode(
3171 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3172 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003173 return v;
3174 }
3175
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003176 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003177 PyErr_SetString(PyExc_TypeError,
3178 "decoding str is not supported");
3179 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003180 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003181
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003182 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3183 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3184 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003185 "decoding to str: need a bytes-like object, %.80s found",
3186 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003187 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003188 }
Tim Petersced69f82003-09-16 20:30:58 +00003189
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003190 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003191 PyBuffer_Release(&buffer);
3192 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003194
Serhiy Storchaka05997252013-01-26 12:14:02 +02003195 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003196 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003197 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198}
3199
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Any run of
   other characters is collapsed into a single '_', except at the very start
   of the output (leading punctuation is dropped) and at the end (trailing
   punctuation is dropped).

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out;
    char *out_limit;     /* last slot, reserved for the terminating null */
    int pending_sep;     /* a punctuation run was seen and not yet emitted */

    assert(encoding != NULL);

    out = lower;
    out_limit = &lower[lower_len - 1];
    pending_sep = 0;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3248
Alexander Belopolsky40018472011-02-26 01:02:56 +00003249PyObject *
3250PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003251 Py_ssize_t size,
3252 const char *encoding,
3253 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003254{
3255 PyObject *buffer = NULL, *unicode;
3256 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003257 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3258
3259 if (encoding == NULL) {
3260 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3261 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003262
Fred Drakee4315f52000-05-09 19:53:39 +00003263 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003264 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3265 char *lower = buflower;
3266
3267 /* Fast paths */
3268 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3269 lower += 3;
3270 if (*lower == '_') {
3271 /* Match "utf8" and "utf_8" */
3272 lower++;
3273 }
3274
3275 if (lower[0] == '8' && lower[1] == 0) {
3276 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3277 }
3278 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3279 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3280 }
3281 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3282 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3283 }
3284 }
3285 else {
3286 if (strcmp(lower, "ascii") == 0
3287 || strcmp(lower, "us_ascii") == 0) {
3288 return PyUnicode_DecodeASCII(s, size, errors);
3289 }
Steve Dowercc16be82016-09-08 10:35:16 -07003290 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003291 else if (strcmp(lower, "mbcs") == 0) {
3292 return PyUnicode_DecodeMBCS(s, size, errors);
3293 }
3294 #endif
3295 else if (strcmp(lower, "latin1") == 0
3296 || strcmp(lower, "latin_1") == 0
3297 || strcmp(lower, "iso_8859_1") == 0
3298 || strcmp(lower, "iso8859_1") == 0) {
3299 return PyUnicode_DecodeLatin1(s, size, errors);
3300 }
3301 }
Victor Stinner37296e82010-06-10 13:36:23 +00003302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303
3304 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003305 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003306 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003307 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003308 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 if (buffer == NULL)
3310 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003311 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 if (unicode == NULL)
3313 goto onError;
3314 if (!PyUnicode_Check(unicode)) {
3315 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003316 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003317 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003318 encoding,
3319 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 Py_DECREF(unicode);
3321 goto onError;
3322 }
3323 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003324 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003325
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 Py_XDECREF(buffer);
3328 return NULL;
3329}
3330
Alexander Belopolsky40018472011-02-26 01:02:56 +00003331PyObject *
3332PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003333 const char *encoding,
3334 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003335{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003336 if (!PyUnicode_Check(unicode)) {
3337 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003338 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003339 }
3340
Serhiy Storchaka00939072016-10-27 21:05:49 +03003341 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3342 "PyUnicode_AsDecodedObject() is deprecated; "
3343 "use PyCodec_Decode() to decode from str", 1) < 0)
3344 return NULL;
3345
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003346 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003347 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003348
3349 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003350 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003351}
3352
Alexander Belopolsky40018472011-02-26 01:02:56 +00003353PyObject *
3354PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003355 const char *encoding,
3356 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003357{
3358 PyObject *v;
3359
3360 if (!PyUnicode_Check(unicode)) {
3361 PyErr_BadArgument();
3362 goto onError;
3363 }
3364
Serhiy Storchaka00939072016-10-27 21:05:49 +03003365 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3366 "PyUnicode_AsDecodedUnicode() is deprecated; "
3367 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3368 return NULL;
3369
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003370 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003371 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003372
3373 /* Decode via the codec registry */
3374 v = PyCodec_Decode(unicode, encoding, errors);
3375 if (v == NULL)
3376 goto onError;
3377 if (!PyUnicode_Check(v)) {
3378 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003379 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003380 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003381 encoding,
3382 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003383 Py_DECREF(v);
3384 goto onError;
3385 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003386 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003387
Benjamin Peterson29060642009-01-31 22:14:21 +00003388 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003389 return NULL;
3390}
3391
Alexander Belopolsky40018472011-02-26 01:02:56 +00003392PyObject *
3393PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003394 Py_ssize_t size,
3395 const char *encoding,
3396 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397{
3398 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003399
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003400 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003402 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3404 Py_DECREF(unicode);
3405 return v;
3406}
3407
Alexander Belopolsky40018472011-02-26 01:02:56 +00003408PyObject *
3409PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003410 const char *encoding,
3411 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003412{
3413 PyObject *v;
3414
3415 if (!PyUnicode_Check(unicode)) {
3416 PyErr_BadArgument();
3417 goto onError;
3418 }
3419
Serhiy Storchaka00939072016-10-27 21:05:49 +03003420 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3421 "PyUnicode_AsEncodedObject() is deprecated; "
3422 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3423 "or PyCodec_Encode() for generic encoding", 1) < 0)
3424 return NULL;
3425
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003426 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003427 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003428
3429 /* Encode via the codec registry */
3430 v = PyCodec_Encode(unicode, encoding, errors);
3431 if (v == NULL)
3432 goto onError;
3433 return v;
3434
Benjamin Peterson29060642009-01-31 22:14:21 +00003435 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003436 return NULL;
3437}
3438
Victor Stinner1b579672011-12-17 05:47:23 +01003439
Victor Stinner2cba6b82018-01-10 22:46:15 +01003440static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003441unicode_encode_locale(PyObject *unicode, const char *errors,
3442 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003443{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003444 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003445
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003446 Py_ssize_t wlen;
3447 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3448 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003449 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003450 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003451
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003452 Py_ssize_t wlen2 = wcslen(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003453 if (wlen2 != wlen) {
3454 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003455 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003456 return NULL;
3457 }
3458
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003459 char *str;
3460 size_t error_pos;
3461 const char *reason;
3462 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003463 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003464 if (res != 0) {
3465 if (res == -2) {
3466 PyObject *exc;
3467 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3468 "locale", unicode,
3469 (Py_ssize_t)error_pos,
3470 (Py_ssize_t)(error_pos+1),
3471 reason);
3472 if (exc != NULL) {
3473 PyCodec_StrictErrors(exc);
3474 Py_DECREF(exc);
3475 }
3476 return NULL;
Victor Stinner2cba6b82018-01-10 22:46:15 +01003477 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003478 else if (res == -3) {
3479 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3480 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003481 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003482 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003483 PyMem_Free(wstr);
3484 return NULL;
3485 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003486 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003487 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003488
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003489 PyObject *bytes = PyBytes_FromString(str);
3490 PyMem_RawFree(str);
3491 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003492}
3493
Victor Stinnerad158722010-10-27 00:25:46 +00003494PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003495PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3496{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003497 return unicode_encode_locale(unicode, errors, 1);
3498}
3499
3500PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003501PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003502{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003503 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003504 const _PyCoreConfig *config = &interp->core_config;
3505#if defined(__APPLE__)
3506 return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
3507#else
Victor Stinner793b5312011-04-27 00:24:21 +02003508 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3509 cannot use it to encode and decode filenames before it is loaded. Load
3510 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003511 implementation of the locale codec until the codec registry is
3512 initialized and the Python codec is loaded. See initfsencoding(). */
3513 if (interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003514 return PyUnicode_AsEncodedString(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003515 config->filesystem_encoding,
3516 config->filesystem_errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003517 }
3518 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003519 return unicode_encode_locale(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003520 config->filesystem_errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003521 }
Victor Stinnerad158722010-10-27 00:25:46 +00003522#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003523}
3524
Alexander Belopolsky40018472011-02-26 01:02:56 +00003525PyObject *
3526PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003527 const char *encoding,
3528 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529{
3530 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003531 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003532
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533 if (!PyUnicode_Check(unicode)) {
3534 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003535 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 }
Fred Drakee4315f52000-05-09 19:53:39 +00003537
Victor Stinner942889a2016-09-05 15:40:10 -07003538 if (encoding == NULL) {
3539 return _PyUnicode_AsUTF8String(unicode, errors);
3540 }
3541
Fred Drakee4315f52000-05-09 19:53:39 +00003542 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003543 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3544 char *lower = buflower;
3545
3546 /* Fast paths */
3547 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3548 lower += 3;
3549 if (*lower == '_') {
3550 /* Match "utf8" and "utf_8" */
3551 lower++;
3552 }
3553
3554 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003555 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003556 }
3557 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3558 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3559 }
3560 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3561 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3562 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003563 }
Victor Stinner942889a2016-09-05 15:40:10 -07003564 else {
3565 if (strcmp(lower, "ascii") == 0
3566 || strcmp(lower, "us_ascii") == 0) {
3567 return _PyUnicode_AsASCIIString(unicode, errors);
3568 }
Steve Dowercc16be82016-09-08 10:35:16 -07003569#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003570 else if (strcmp(lower, "mbcs") == 0) {
3571 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3572 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003573#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003574 else if (strcmp(lower, "latin1") == 0 ||
3575 strcmp(lower, "latin_1") == 0 ||
3576 strcmp(lower, "iso_8859_1") == 0 ||
3577 strcmp(lower, "iso8859_1") == 0) {
3578 return _PyUnicode_AsLatin1String(unicode, errors);
3579 }
3580 }
Victor Stinner37296e82010-06-10 13:36:23 +00003581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582
3583 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003584 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003586 return NULL;
3587
3588 /* The normal path */
3589 if (PyBytes_Check(v))
3590 return v;
3591
3592 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003593 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003594 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003595 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003596
3597 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003598 "encoder %s returned bytearray instead of bytes; "
3599 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003600 encoding);
3601 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003602 Py_DECREF(v);
3603 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003604 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003605
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003606 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3607 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003608 Py_DECREF(v);
3609 return b;
3610 }
3611
3612 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003613 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003614 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003615 encoding,
3616 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003617 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003618 return NULL;
3619}
3620
Alexander Belopolsky40018472011-02-26 01:02:56 +00003621PyObject *
3622PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003623 const char *encoding,
3624 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003625{
3626 PyObject *v;
3627
3628 if (!PyUnicode_Check(unicode)) {
3629 PyErr_BadArgument();
3630 goto onError;
3631 }
3632
Serhiy Storchaka00939072016-10-27 21:05:49 +03003633 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3634 "PyUnicode_AsEncodedUnicode() is deprecated; "
3635 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3636 return NULL;
3637
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003638 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003639 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003640
3641 /* Encode via the codec registry */
3642 v = PyCodec_Encode(unicode, encoding, errors);
3643 if (v == NULL)
3644 goto onError;
3645 if (!PyUnicode_Check(v)) {
3646 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003647 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003648 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003649 encoding,
3650 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003651 Py_DECREF(v);
3652 goto onError;
3653 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003655
Benjamin Peterson29060642009-01-31 22:14:21 +00003656 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 return NULL;
3658}
3659
Victor Stinner2cba6b82018-01-10 22:46:15 +01003660static PyObject*
3661unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3662 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003663{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003664 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003665
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003666 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3667 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003668 return NULL;
3669 }
3670
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003671 wchar_t *wstr;
3672 size_t wlen;
3673 const char *reason;
3674 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003675 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003676 if (res != 0) {
3677 if (res == -2) {
3678 PyObject *exc;
3679 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3680 "locale", str, len,
3681 (Py_ssize_t)wlen,
3682 (Py_ssize_t)(wlen + 1),
3683 reason);
3684 if (exc != NULL) {
3685 PyCodec_StrictErrors(exc);
3686 Py_DECREF(exc);
3687 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003688 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003689 else if (res == -3) {
3690 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3691 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003692 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003693 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003694 }
Victor Stinner2f197072011-12-17 07:08:30 +01003695 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003696 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003697
3698 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3699 PyMem_RawFree(wstr);
3700 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003701}
3702
3703PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003704PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3705 const char *errors)
3706{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003707 return unicode_decode_locale(str, len, errors, 1);
3708}
3709
3710PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003711PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003712{
3713 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003714 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003715}
3716
3717
3718PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003719PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003720 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003721 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3722}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003723
Christian Heimes5894ba72007-11-04 11:43:14 +00003724PyObject*
3725PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3726{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003727 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003728 const _PyCoreConfig *config = &interp->core_config;
3729#if defined(__APPLE__)
3730 return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
3731#else
Victor Stinner793b5312011-04-27 00:24:21 +02003732 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3733 cannot use it to encode and decode filenames before it is loaded. Load
3734 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003735 implementation of the locale codec until the codec registry is
3736 initialized and the Python codec is loaded. See initfsencoding(). */
3737 if (interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003738 return PyUnicode_Decode(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003739 config->filesystem_encoding,
3740 config->filesystem_errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003741 }
3742 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003743 return unicode_decode_locale(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003744 config->filesystem_errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003745 }
Victor Stinnerad158722010-10-27 00:25:46 +00003746#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003747}
3748
Martin v. Löwis011e8422009-05-05 04:43:17 +00003749
3750int
3751PyUnicode_FSConverter(PyObject* arg, void* addr)
3752{
Brett Cannonec6ce872016-09-06 15:50:29 -07003753 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003754 PyObject *output = NULL;
3755 Py_ssize_t size;
3756 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003757 if (arg == NULL) {
3758 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003759 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003760 return 1;
3761 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003762 path = PyOS_FSPath(arg);
3763 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003764 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003765 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003766 if (PyBytes_Check(path)) {
3767 output = path;
3768 }
3769 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3770 output = PyUnicode_EncodeFSDefault(path);
3771 Py_DECREF(path);
3772 if (!output) {
3773 return 0;
3774 }
3775 assert(PyBytes_Check(output));
3776 }
3777
Victor Stinner0ea2a462010-04-30 00:22:08 +00003778 size = PyBytes_GET_SIZE(output);
3779 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003780 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003781 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003782 Py_DECREF(output);
3783 return 0;
3784 }
3785 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003786 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003787}
3788
3789
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003790int
3791PyUnicode_FSDecoder(PyObject* arg, void* addr)
3792{
Brett Cannona5711202016-09-06 19:36:01 -07003793 int is_buffer = 0;
3794 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003795 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003796 if (arg == NULL) {
3797 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003798 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003799 return 1;
3800 }
Brett Cannona5711202016-09-06 19:36:01 -07003801
3802 is_buffer = PyObject_CheckBuffer(arg);
3803 if (!is_buffer) {
3804 path = PyOS_FSPath(arg);
3805 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003806 return 0;
3807 }
Brett Cannona5711202016-09-06 19:36:01 -07003808 }
3809 else {
3810 path = arg;
3811 Py_INCREF(arg);
3812 }
3813
3814 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003815 output = path;
3816 }
3817 else if (PyBytes_Check(path) || is_buffer) {
3818 PyObject *path_bytes = NULL;
3819
3820 if (!PyBytes_Check(path) &&
3821 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003822 "path should be string, bytes, or os.PathLike, not %.200s",
3823 Py_TYPE(arg)->tp_name)) {
3824 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003825 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003826 }
3827 path_bytes = PyBytes_FromObject(path);
3828 Py_DECREF(path);
3829 if (!path_bytes) {
3830 return 0;
3831 }
3832 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3833 PyBytes_GET_SIZE(path_bytes));
3834 Py_DECREF(path_bytes);
3835 if (!output) {
3836 return 0;
3837 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003838 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003839 else {
3840 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003841 "path should be string, bytes, or os.PathLike, not %.200s",
3842 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003843 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003844 return 0;
3845 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003846 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003847 Py_DECREF(output);
3848 return 0;
3849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003850 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003851 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003852 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003853 Py_DECREF(output);
3854 return 0;
3855 }
3856 *(PyObject**)addr = output;
3857 return Py_CLEANUP_SUPPORTED;
3858}
3859
3860
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003861const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003863{
Christian Heimesf3863112007-11-22 07:46:41 +00003864 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003865
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003866 if (!PyUnicode_Check(unicode)) {
3867 PyErr_BadArgument();
3868 return NULL;
3869 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003870 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003871 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003872
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003873 if (PyUnicode_UTF8(unicode) == NULL) {
3874 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003875 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876 if (bytes == NULL)
3877 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3879 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003880 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 Py_DECREF(bytes);
3882 return NULL;
3883 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003884 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003885 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003886 PyBytes_AS_STRING(bytes),
3887 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003888 Py_DECREF(bytes);
3889 }
3890
3891 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003892 *psize = PyUnicode_UTF8_LENGTH(unicode);
3893 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003894}
3895
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003896const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003898{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3900}
3901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003902Py_UNICODE *
3903PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003905 if (!PyUnicode_Check(unicode)) {
3906 PyErr_BadArgument();
3907 return NULL;
3908 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003909 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3910 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003912 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003913 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003914
Serhiy Storchakac46db922018-10-23 22:58:24 +03003915 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3916 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3917 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003920 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3921 if (w == NULL) {
3922 PyErr_NoMemory();
3923 return NULL;
3924 }
3925 unicode_copy_as_widechar(unicode, w, wlen + 1);
3926 _PyUnicode_WSTR(unicode) = w;
3927 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3928 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003929 }
3930 }
3931 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003932 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003933 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00003934}
3935
Alexander Belopolsky40018472011-02-26 01:02:56 +00003936Py_UNICODE *
3937PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940}
3941
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003942const Py_UNICODE *
3943_PyUnicode_AsUnicode(PyObject *unicode)
3944{
3945 Py_ssize_t size;
3946 const Py_UNICODE *wstr;
3947
3948 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3949 if (wstr && wcslen(wstr) != (size_t)size) {
3950 PyErr_SetString(PyExc_ValueError, "embedded null character");
3951 return NULL;
3952 }
3953 return wstr;
3954}
3955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956
Alexander Belopolsky40018472011-02-26 01:02:56 +00003957Py_ssize_t
3958PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959{
3960 if (!PyUnicode_Check(unicode)) {
3961 PyErr_BadArgument();
3962 goto onError;
3963 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003964 if (_PyUnicode_WSTR(unicode) == NULL) {
3965 if (PyUnicode_AsUnicode(unicode) == NULL)
3966 goto onError;
3967 }
3968 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969
Benjamin Peterson29060642009-01-31 22:14:21 +00003970 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 return -1;
3972}
3973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974Py_ssize_t
3975PyUnicode_GetLength(PyObject *unicode)
3976{
Victor Stinner07621332012-06-16 04:53:46 +02003977 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978 PyErr_BadArgument();
3979 return -1;
3980 }
Victor Stinner07621332012-06-16 04:53:46 +02003981 if (PyUnicode_READY(unicode) == -1)
3982 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983 return PyUnicode_GET_LENGTH(unicode);
3984}
3985
3986Py_UCS4
3987PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3988{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003989 void *data;
3990 int kind;
3991
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003992 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003993 PyErr_BadArgument();
3994 return (Py_UCS4)-1;
3995 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003996 if (PyUnicode_READY(unicode) == -1) {
3997 return (Py_UCS4)-1;
3998 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003999 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004000 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 return (Py_UCS4)-1;
4002 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004003 data = PyUnicode_DATA(unicode);
4004 kind = PyUnicode_KIND(unicode);
4005 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006}
4007
4008int
4009PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4010{
4011 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004012 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013 return -1;
4014 }
Victor Stinner488fa492011-12-12 00:01:39 +01004015 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004016 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004017 PyErr_SetString(PyExc_IndexError, "string index out of range");
4018 return -1;
4019 }
Victor Stinner488fa492011-12-12 00:01:39 +01004020 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004021 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004022 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4023 PyErr_SetString(PyExc_ValueError, "character out of range");
4024 return -1;
4025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004026 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4027 index, ch);
4028 return 0;
4029}
4030
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4036
Victor Stinner554f3f02010-06-16 23:33:54 +00004037/* create or adjust a UnicodeDecodeError */
4038static void
4039make_decode_exception(PyObject **exceptionObject,
4040 const char *encoding,
4041 const char *input, Py_ssize_t length,
4042 Py_ssize_t startpos, Py_ssize_t endpos,
4043 const char *reason)
4044{
4045 if (*exceptionObject == NULL) {
4046 *exceptionObject = PyUnicodeDecodeError_Create(
4047 encoding, input, length, startpos, endpos, reason);
4048 }
4049 else {
4050 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4051 goto onError;
4052 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4053 goto onError;
4054 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4055 goto onError;
4056 }
4057 return;
4058
4059onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004060 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004061}
4062
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders that write into a
   wide-character (PyUnicode_WCHAR_KIND) output string.

   Looks up the error handler if *errorHandler is NULL, creates or updates
   the exception object, calls the handler and checks that it returned a
   (str, int) tuple.  The input pointers are then refreshed from the
   exception object (the callback may have replaced it), the output is grown
   if needed, the replacement is copied to the output, and *outpos, *endinpos
   and *inptr are updated.

   Return 0 on success, -1 on error with an exception set. */

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "Un" is the PyArg_ParseTuple() format; the text after ';' doubles as
       the TypeError message used when the result is not a tuple. */
    static const char *format = "Un;decoding error handler must return (str, int) tuple";

    PyObject *result = NULL;
    PyObject *replacement = NULL;
    PyObject *input_bytes = NULL;
    Py_ssize_t out_capacity;
    Py_ssize_t in_length;
    Py_ssize_t needed;
    Py_ssize_t resume_at;
    wchar_t *rep_wstr;
    Py_ssize_t rep_wlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    out_capacity = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL) {
            goto onError;
        }
    }

    make_decode_exception(exceptionObject,
                          encoding,
                          *input, *inend - *input,
                          *startinpos, *endinpos,
                          reason);
    if (*exceptionObject == NULL) {
        goto onError;
    }

    result = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (result == NULL) {
        goto onError;
    }
    if (!PyTuple_Check(result)) {
        /* Skip the "Un;" prefix to get the bare message. */
        PyErr_SetString(PyExc_TypeError, format + 3);
        goto onError;
    }
    if (!PyArg_ParseTuple(result, format, &replacement, &resume_at)) {
        goto onError;
    }

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    input_bytes = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (input_bytes == NULL) {
        goto onError;
    }
    *input = PyBytes_AS_STRING(input_bytes);
    in_length = PyBytes_GET_SIZE(input_bytes);
    *inend = *input + in_length;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(input_bytes);

    /* A negative position counts from the end of the input. */
    if (resume_at < 0) {
        resume_at = in_length + resume_at;
    }
    if (resume_at < 0 || resume_at > in_length) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", resume_at);
        goto onError;
    }

    rep_wstr = PyUnicode_AsUnicodeAndSize(replacement, &rep_wlen);
    if (rep_wstr == NULL) {
        goto onError;
    }

    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    needed = *outpos;
    if (needed > PY_SSIZE_T_MAX - rep_wlen) {
        goto overflow;
    }
    needed += rep_wlen;
    if (needed > PY_SSIZE_T_MAX - (in_length - resume_at)) {
        goto overflow;
    }
    needed += in_length - resume_at;
    if (needed > out_capacity) {
        /* Grow to at least twice the current capacity when that is larger
           than what is strictly needed and cannot overflow. */
        if (out_capacity <= PY_SSIZE_T_MAX/2 && needed < 2*out_capacity) {
            needed = 2*out_capacity;
        }
        if (unicode_resize(output, needed) < 0) {
            goto onError;
        }
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, rep_wstr, rep_wlen);
    *outpos += rep_wlen;
    *endinpos = resume_at;
    *inptr = *input + resume_at;

    /* we made it! */
    Py_DECREF(result);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(result);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004175
4176static int
4177unicode_decode_call_errorhandler_writer(
4178 const char *errors, PyObject **errorHandler,
4179 const char *encoding, const char *reason,
4180 const char **input, const char **inend, Py_ssize_t *startinpos,
4181 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4182 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4183{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004184 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004185
4186 PyObject *restuple = NULL;
4187 PyObject *repunicode = NULL;
4188 Py_ssize_t insize;
4189 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004190 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004191 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004192 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004193 int need_to_grow = 0;
4194 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004195
4196 if (*errorHandler == NULL) {
4197 *errorHandler = PyCodec_LookupError(errors);
4198 if (*errorHandler == NULL)
4199 goto onError;
4200 }
4201
4202 make_decode_exception(exceptionObject,
4203 encoding,
4204 *input, *inend - *input,
4205 *startinpos, *endinpos,
4206 reason);
4207 if (*exceptionObject == NULL)
4208 goto onError;
4209
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004210 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004211 if (restuple == NULL)
4212 goto onError;
4213 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004214 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004215 goto onError;
4216 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004217 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004218 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004219
4220 /* Copy back the bytes variables, which might have been modified by the
4221 callback */
4222 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4223 if (!inputobj)
4224 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004225 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004226 *input = PyBytes_AS_STRING(inputobj);
4227 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004228 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004229 /* we can DECREF safely, as the exception has another reference,
4230 so the object won't go away. */
4231 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004232
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004234 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004235 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004236 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004237 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004238 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239
Victor Stinner170ca6f2013-04-18 00:25:28 +02004240 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004241 if (replen > 1) {
4242 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004243 need_to_grow = 1;
4244 }
4245 new_inptr = *input + newpos;
4246 if (*inend - new_inptr > remain) {
4247 /* We don't know the decoding algorithm here so we make the worst
4248 assumption that one byte decodes to one unicode character.
4249 If unfortunately one byte could decode to more unicode characters,
4250 the decoder may write out-of-bound then. Is it possible for the
4251 algorithms using this function? */
4252 writer->min_length += *inend - new_inptr - remain;
4253 need_to_grow = 1;
4254 }
4255 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004256 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004257 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004258 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4259 goto onError;
4260 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004261 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004262 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004263
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004264 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004265 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004266
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004268 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004269 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004273 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274}
4275
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004276/* --- UTF-7 Codec -------------------------------------------------------- */
4277
Antoine Pitrou244651a2009-05-04 18:56:13 +00004278/* See RFC2152 for details. We encode conservatively and decode liberally. */
4279
/* Base-64 helper macros for the UTF-7 codec.  IS_BASE64, FROM_BASE64 and
 * DECODE_DIRECT evaluate their argument several times, so only pass
 * expressions without side effects. */

/* Non-zero iff c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* Value (0..63) of the base-64 character c.  Anything that matches none of
 * the tested ranges (which includes '/') maps to 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte found in UTF-7 input stands for itself.
 * Decoding is permissive: every value up to 127 qualifies except '+',
 * which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
4310
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. + \ ~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        code point to classify; evaluated several times, so it must be
 *          free of side effects.  Values outside 1..127 are never direct.
 * directO  non-zero if Set O characters may be written unencoded.
 * directWS non-zero if whitespace may be written unencoded.
 *
 * The two flag arguments are parenthesized in the expansion so that a
 * compound expression (e.g. `a || b`) is treated as one operand of &&. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004356
Alexander Belopolsky40018472011-02-26 01:02:56 +00004357PyObject *
4358PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004359 Py_ssize_t size,
4360 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004361{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004362 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4363}
4364
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365/* The decoder. The only state we preserve is our read position,
4366 * i.e. how many characters we have consumed. So if we end in the
4367 * middle of a shift sequence we have to back off the read position
4368 * and the output to the beginning of the sequence, otherwise we lose
4369 * all the shift state (seen bits, number of bits seen, high
4370 * surrogate). */
4371
Alexander Belopolsky40018472011-02-26 01:02:56 +00004372PyObject *
4373PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004374 Py_ssize_t size,
4375 const char *errors,
4376 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004377{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004379 Py_ssize_t startinpos;
4380 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004381 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004382 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004383 const char *errmsg = "";
4384 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004385 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 unsigned int base64bits = 0;
4387 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004388 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 PyObject *errorHandler = NULL;
4390 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004391
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004392 if (size == 0) {
4393 if (consumed)
4394 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004395 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004396 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004397
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004398 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004399 _PyUnicodeWriter_Init(&writer);
4400 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004401
4402 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004403 e = s + size;
4404
4405 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004406 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004408 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004409
Antoine Pitrou244651a2009-05-04 18:56:13 +00004410 if (inShift) { /* in a base-64 section */
4411 if (IS_BASE64(ch)) { /* consume a base-64 character */
4412 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4413 base64bits += 6;
4414 s++;
4415 if (base64bits >= 16) {
4416 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004417 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 base64bits -= 16;
4419 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004420 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 if (surrogate) {
4422 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004423 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4424 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004425 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004426 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004428 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 }
4430 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004431 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004432 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004434 }
4435 }
Victor Stinner551ac952011-11-29 22:58:13 +01004436 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004437 /* first surrogate */
4438 surrogate = outCh;
4439 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004440 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004441 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004442 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004443 }
4444 }
4445 }
4446 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004447 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448 if (base64bits > 0) { /* left-over bits */
4449 if (base64bits >= 6) {
4450 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004451 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 errmsg = "partial character in shift sequence";
4453 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004454 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 else {
4456 /* Some bits remain; they should be zero */
4457 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004458 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 errmsg = "non-zero padding bits in shift sequence";
4460 goto utf7Error;
4461 }
4462 }
4463 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004464 if (surrogate && DECODE_DIRECT(ch)) {
4465 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4466 goto onError;
4467 }
4468 surrogate = 0;
4469 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 /* '-' is absorbed; other terminating
4471 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004472 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004474 }
4475 }
4476 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004478 s++; /* consume '+' */
4479 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004481 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004482 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004483 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004484 else if (s < e && !IS_BASE64(*s)) {
4485 s++;
4486 errmsg = "ill-formed sequence";
4487 goto utf7Error;
4488 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004489 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004491 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004492 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004494 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004495 }
4496 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004499 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004500 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004501 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 else {
4503 startinpos = s-starts;
4504 s++;
4505 errmsg = "unexpected special character";
4506 goto utf7Error;
4507 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004509utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004511 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 errors, &errorHandler,
4513 "utf7", errmsg,
4514 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004515 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004517 }
4518
Antoine Pitrou244651a2009-05-04 18:56:13 +00004519 /* end of string */
4520
4521 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4522 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004523 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 if (surrogate ||
4525 (base64bits >= 6) ||
4526 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004527 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004528 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004529 errors, &errorHandler,
4530 "utf7", "unterminated shift sequence",
4531 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004532 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 goto onError;
4534 if (s < e)
4535 goto restart;
4536 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004537 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004538
4539 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004540 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004541 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004542 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004543 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004544 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004545 writer.kind, writer.data, shiftOutStart);
4546 Py_XDECREF(errorHandler);
4547 Py_XDECREF(exc);
4548 _PyUnicodeWriter_Dealloc(&writer);
4549 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004550 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004551 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004552 }
4553 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004554 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004555 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004556 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 Py_XDECREF(errorHandler);
4559 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004560 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561
Benjamin Peterson29060642009-01-31 22:14:21 +00004562 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 Py_XDECREF(errorHandler);
4564 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004565 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004566 return NULL;
4567}
4568
4569
Alexander Belopolsky40018472011-02-26 01:02:56 +00004570PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004571_PyUnicode_EncodeUTF7(PyObject *str,
4572 int base64SetO,
4573 int base64WhiteSpace,
4574 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004576 int kind;
4577 void *data;
4578 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004579 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004580 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004581 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 unsigned int base64bits = 0;
4583 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 char * out;
4585 char * start;
4586
Benjamin Petersonbac79492012-01-14 13:34:47 -05004587 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004588 return NULL;
4589 kind = PyUnicode_KIND(str);
4590 data = PyUnicode_DATA(str);
4591 len = PyUnicode_GET_LENGTH(str);
4592
4593 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004595
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004596 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004597 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004598 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004599 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004600 if (v == NULL)
4601 return NULL;
4602
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004603 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004604 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004605 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004606
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 if (inShift) {
4608 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4609 /* shifting out */
4610 if (base64bits) { /* output remaining bits */
4611 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4612 base64buffer = 0;
4613 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004614 }
4615 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 /* Characters not in the BASE64 set implicitly unshift the sequence
4617 so no '-' is required, except if the character is itself a '-' */
4618 if (IS_BASE64(ch) || ch == '-') {
4619 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004620 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004621 *out++ = (char) ch;
4622 }
4623 else {
4624 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004625 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 else { /* not in a shift sequence */
4628 if (ch == '+') {
4629 *out++ = '+';
4630 *out++ = '-';
4631 }
4632 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4633 *out++ = (char) ch;
4634 }
4635 else {
4636 *out++ = '+';
4637 inShift = 1;
4638 goto encode_char;
4639 }
4640 }
4641 continue;
4642encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004643 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004644 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004645
Antoine Pitrou244651a2009-05-04 18:56:13 +00004646 /* code first surrogate */
4647 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004648 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004649 while (base64bits >= 6) {
4650 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4651 base64bits -= 6;
4652 }
4653 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004654 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004655 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004656 base64bits += 16;
4657 base64buffer = (base64buffer << 16) | ch;
4658 while (base64bits >= 6) {
4659 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4660 base64bits -= 6;
4661 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004662 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 if (base64bits)
4664 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4665 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004666 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004667 if (_PyBytes_Resize(&v, out - start) < 0)
4668 return NULL;
4669 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004670}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004671PyObject *
4672PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4673 Py_ssize_t size,
4674 int base64SetO,
4675 int base64WhiteSpace,
4676 const char *errors)
4677{
4678 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004679 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004680 if (tmp == NULL)
4681 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004682 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004683 base64WhiteSpace, errors);
4684 Py_DECREF(tmp);
4685 return result;
4686}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004687
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688#undef IS_BASE64
4689#undef FROM_BASE64
4690#undef TO_BASE64
4691#undef DECODE_DIRECT
4692#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004693
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694/* --- UTF-8 Codec -------------------------------------------------------- */
4695
Alexander Belopolsky40018472011-02-26 01:02:56 +00004696PyObject *
4697PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004698 Py_ssize_t size,
4699 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700{
Walter Dörwald69652032004-09-07 20:24:22 +00004701 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4702}
4703
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004704#include "stringlib/asciilib.h"
4705#include "stringlib/codecs.h"
4706#include "stringlib/undef.h"
4707
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004708#include "stringlib/ucs1lib.h"
4709#include "stringlib/codecs.h"
4710#include "stringlib/undef.h"
4711
4712#include "stringlib/ucs2lib.h"
4713#include "stringlib/codecs.h"
4714#include "stringlib/undef.h"
4715
4716#include "stringlib/ucs4lib.h"
4717#include "stringlib/codecs.h"
4718#include "stringlib/undef.h"
4719
Antoine Pitrouab868312009-01-10 15:40:25 +00004720/* Mask to quickly check whether a C 'long' contains a
4721 non-ASCII, UTF8-encoded char. */
4722#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004723# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004724#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004725# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004726#else
4727# error C 'long' size should be either 4 or 8!
4728#endif
4729
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004730static Py_ssize_t
4731ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004732{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004733 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004734 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004735
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004736 /*
4737 * Issue #17237: m68k is a bit different from most architectures in
4738 * that objects do not use "natural alignment" - for example, int and
4739 * long are only aligned at 2-byte boundaries. Therefore the assert()
4740 * won't work; also, tests have shown that skipping the "optimised
4741 * version" will even speed up m68k.
4742 */
4743#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004744#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004745 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4746 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004747 /* Fast path, see in STRINGLIB(utf8_decode) for
4748 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004749 /* Help allocation */
4750 const char *_p = p;
4751 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004752 while (_p < aligned_end) {
4753 unsigned long value = *(const unsigned long *) _p;
4754 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004755 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004756 *((unsigned long *)q) = value;
4757 _p += SIZEOF_LONG;
4758 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004759 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004760 p = _p;
4761 while (p < end) {
4762 if ((unsigned char)*p & 0x80)
4763 break;
4764 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004766 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004768#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004769#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 while (p < end) {
4771 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4772 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004773 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004774 /* Help allocation */
4775 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004776 while (_p < aligned_end) {
4777 unsigned long value = *(unsigned long *) _p;
4778 if (value & ASCII_CHAR_MASK)
4779 break;
4780 _p += SIZEOF_LONG;
4781 }
4782 p = _p;
4783 if (_p == end)
4784 break;
4785 }
4786 if ((unsigned char)*p & 0x80)
4787 break;
4788 ++p;
4789 }
4790 memcpy(dest, start, p - start);
4791 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792}
Antoine Pitrouab868312009-01-10 15:40:25 +00004793
Victor Stinner785938e2011-12-11 20:09:03 +01004794PyObject *
4795PyUnicode_DecodeUTF8Stateful(const char *s,
4796 Py_ssize_t size,
4797 const char *errors,
4798 Py_ssize_t *consumed)
4799{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004800 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004801 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004802 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004803
4804 Py_ssize_t startinpos;
4805 Py_ssize_t endinpos;
4806 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004807 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004808 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004809 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004810
4811 if (size == 0) {
4812 if (consumed)
4813 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004814 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004815 }
4816
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004817 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4818 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004819 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004820 *consumed = 1;
4821 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004822 }
4823
Victor Stinner8f674cc2013-04-17 23:02:17 +02004824 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004825 writer.min_length = size;
4826 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004827 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004828
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004829 writer.pos = ascii_decode(s, end, writer.data);
4830 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004831 while (s < end) {
4832 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004833 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004834
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004835 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004836 if (PyUnicode_IS_ASCII(writer.buffer))
4837 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004838 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004839 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004840 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004841 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004842 } else {
4843 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004844 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004845 }
4846
4847 switch (ch) {
4848 case 0:
4849 if (s == end || consumed)
4850 goto End;
4851 errmsg = "unexpected end of data";
4852 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004853 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004854 break;
4855 case 1:
4856 errmsg = "invalid start byte";
4857 startinpos = s - starts;
4858 endinpos = startinpos + 1;
4859 break;
4860 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004861 case 3:
4862 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004863 errmsg = "invalid continuation byte";
4864 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004865 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004866 break;
4867 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004868 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004869 goto onError;
4870 continue;
4871 }
4872
Victor Stinner1d65d912015-10-05 13:43:50 +02004873 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004874 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004875
4876 switch (error_handler) {
4877 case _Py_ERROR_IGNORE:
4878 s += (endinpos - startinpos);
4879 break;
4880
4881 case _Py_ERROR_REPLACE:
4882 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4883 goto onError;
4884 s += (endinpos - startinpos);
4885 break;
4886
4887 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004888 {
4889 Py_ssize_t i;
4890
Victor Stinner1d65d912015-10-05 13:43:50 +02004891 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4892 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004893 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004894 ch = (Py_UCS4)(unsigned char)(starts[i]);
4895 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4896 ch + 0xdc00);
4897 writer.pos++;
4898 }
4899 s += (endinpos - startinpos);
4900 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004901 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004902
4903 default:
4904 if (unicode_decode_call_errorhandler_writer(
4905 errors, &error_handler_obj,
4906 "utf-8", errmsg,
4907 &starts, &end, &startinpos, &endinpos, &exc, &s,
4908 &writer))
4909 goto onError;
4910 }
Victor Stinner785938e2011-12-11 20:09:03 +01004911 }
4912
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004913End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 if (consumed)
4915 *consumed = s - starts;
4916
Victor Stinner1d65d912015-10-05 13:43:50 +02004917 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004919 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004920
4921onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004922 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004923 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004924 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004925 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004926}
4927
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004928
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004929/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4930 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004931
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004932 On success, write a pointer to a newly allocated wide character string into
4933 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4934 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004935
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004936 On memory allocation failure, return -1.
4937
4938 On decoding error (if surrogateescape is zero), return -2. If wlen is
4939 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4940 is not NULL, write the decoding error message into *reason. */
4941int
4942_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02004943 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004944{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004945 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004946 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947 wchar_t *unicode;
4948 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004949
Victor Stinner3d4226a2018-08-29 22:21:32 +02004950 int surrogateescape = 0;
4951 int surrogatepass = 0;
4952 switch (errors)
4953 {
4954 case _Py_ERROR_STRICT:
4955 break;
4956 case _Py_ERROR_SURROGATEESCAPE:
4957 surrogateescape = 1;
4958 break;
4959 case _Py_ERROR_SURROGATEPASS:
4960 surrogatepass = 1;
4961 break;
4962 default:
4963 return -3;
4964 }
4965
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004966 /* Note: size will always be longer than the resulting Unicode
4967 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004968 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004969 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004970 }
4971
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004972 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004973 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004974 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004975 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004976
4977 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004978 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004979 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004980 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004981 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004982#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004983 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004984#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004985 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004986#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 if (ch > 0xFF) {
4988#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07004989 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004990#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02004991 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004992 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4994 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4995#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004996 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02004998 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005000 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005001
5002 if (surrogateescape) {
5003 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5004 }
5005 else {
5006 /* Is it a valid three-byte code? */
5007 if (surrogatepass
5008 && (e - s) >= 3
5009 && (s[0] & 0xf0) == 0xe0
5010 && (s[1] & 0xc0) == 0x80
5011 && (s[2] & 0xc0) == 0x80)
5012 {
5013 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5014 s += 3;
5015 unicode[outpos++] = ch;
5016 }
5017 else {
5018 PyMem_RawFree(unicode );
5019 if (reason != NULL) {
5020 switch (ch) {
5021 case 0:
5022 *reason = "unexpected end of data";
5023 break;
5024 case 1:
5025 *reason = "invalid start byte";
5026 break;
5027 /* 2, 3, 4 */
5028 default:
5029 *reason = "invalid continuation byte";
5030 break;
5031 }
5032 }
5033 if (wlen != NULL) {
5034 *wlen = s - orig_s;
5035 }
5036 return -2;
5037 }
5038 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005039 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005040 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005041 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005042 if (wlen) {
5043 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005044 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005045 *wstr = unicode;
5046 return 0;
5047}
5048
5049wchar_t*
5050_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5051{
5052 wchar_t *wstr;
5053 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5054 if (res != 0) {
5055 return NULL;
5056 }
5057 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005058}
5059
Antoine Pitrouab868312009-01-10 15:40:25 +00005060
Victor Stinnere47e6982017-12-21 15:45:16 +01005061/* UTF-8 encoder using the surrogateescape error handler .
5062
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005063 On success, return 0 and write the newly allocated character string (use
5064 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005065
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005066 On encoding failure, return -2 and write the position of the invalid
5067 surrogate character into *error_pos (if error_pos is set) and the decoding
5068 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005069
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005070 On memory allocation failure, return -1. */
5071int
5072_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005073 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005074{
5075 const Py_ssize_t max_char_size = 4;
5076 Py_ssize_t len = wcslen(text);
5077
5078 assert(len >= 0);
5079
Victor Stinner3d4226a2018-08-29 22:21:32 +02005080 int surrogateescape = 0;
5081 int surrogatepass = 0;
5082 switch (errors)
5083 {
5084 case _Py_ERROR_STRICT:
5085 break;
5086 case _Py_ERROR_SURROGATEESCAPE:
5087 surrogateescape = 1;
5088 break;
5089 case _Py_ERROR_SURROGATEPASS:
5090 surrogatepass = 1;
5091 break;
5092 default:
5093 return -3;
5094 }
5095
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005096 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5097 return -1;
5098 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005099 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005100 if (raw_malloc) {
5101 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005102 }
5103 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005104 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005105 }
5106 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005107 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005108 }
5109
5110 char *p = bytes;
5111 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005112 for (i = 0; i < len; ) {
5113 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005114 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005115 i++;
5116#if Py_UNICODE_SIZE == 2
5117 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5118 && i < len
5119 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5120 {
5121 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5122 i++;
5123 }
5124#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005125
5126 if (ch < 0x80) {
5127 /* Encode ASCII */
5128 *p++ = (char) ch;
5129
5130 }
5131 else if (ch < 0x0800) {
5132 /* Encode Latin-1 */
5133 *p++ = (char)(0xc0 | (ch >> 6));
5134 *p++ = (char)(0x80 | (ch & 0x3f));
5135 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005136 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005137 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005138 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005139 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005140 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005141 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005142 if (reason != NULL) {
5143 *reason = "encoding error";
5144 }
5145 if (raw_malloc) {
5146 PyMem_RawFree(bytes);
5147 }
5148 else {
5149 PyMem_Free(bytes);
5150 }
5151 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005152 }
5153 *p++ = (char)(ch & 0xff);
5154 }
5155 else if (ch < 0x10000) {
5156 *p++ = (char)(0xe0 | (ch >> 12));
5157 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5158 *p++ = (char)(0x80 | (ch & 0x3f));
5159 }
5160 else { /* ch >= 0x10000 */
5161 assert(ch <= MAX_UNICODE);
5162 /* Encode UCS4 Unicode ordinals */
5163 *p++ = (char)(0xf0 | (ch >> 18));
5164 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5165 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5166 *p++ = (char)(0x80 | (ch & 0x3f));
5167 }
5168 }
5169 *p++ = '\0';
5170
5171 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005172 char *bytes2;
5173 if (raw_malloc) {
5174 bytes2 = PyMem_RawRealloc(bytes, final_size);
5175 }
5176 else {
5177 bytes2 = PyMem_Realloc(bytes, final_size);
5178 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005179 if (bytes2 == NULL) {
5180 if (error_pos != NULL) {
5181 *error_pos = (size_t)-1;
5182 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005183 if (raw_malloc) {
5184 PyMem_RawFree(bytes);
5185 }
5186 else {
5187 PyMem_Free(bytes);
5188 }
5189 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005190 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005191 *str = bytes2;
5192 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005193}
5194
5195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005196/* Primary internal function which creates utf8 encoded bytes objects.
5197
5198 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005199 and allocate exactly as much space needed at the end. Else allocate the
5200 maximum possible needed (4 result bytes per Unicode character), and return
5201 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005202*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005203PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005204_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205{
Victor Stinner6099a032011-12-18 14:22:26 +01005206 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005207 void *data;
5208 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005210 if (!PyUnicode_Check(unicode)) {
5211 PyErr_BadArgument();
5212 return NULL;
5213 }
5214
5215 if (PyUnicode_READY(unicode) == -1)
5216 return NULL;
5217
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005218 if (PyUnicode_UTF8(unicode))
5219 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5220 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005221
5222 kind = PyUnicode_KIND(unicode);
5223 data = PyUnicode_DATA(unicode);
5224 size = PyUnicode_GET_LENGTH(unicode);
5225
Benjamin Petersonead6b532011-12-20 17:23:42 -06005226 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005227 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005228 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005229 case PyUnicode_1BYTE_KIND:
5230 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5231 assert(!PyUnicode_IS_ASCII(unicode));
5232 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5233 case PyUnicode_2BYTE_KIND:
5234 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5235 case PyUnicode_4BYTE_KIND:
5236 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238}
5239
Alexander Belopolsky40018472011-02-26 01:02:56 +00005240PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005241PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5242 Py_ssize_t size,
5243 const char *errors)
5244{
5245 PyObject *v, *unicode;
5246
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005247 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005248 if (unicode == NULL)
5249 return NULL;
5250 v = _PyUnicode_AsUTF8String(unicode, errors);
5251 Py_DECREF(unicode);
5252 return v;
5253}
5254
5255PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005256PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005258 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259}
5260
Walter Dörwald41980ca2007-08-16 21:55:45 +00005261/* --- UTF-32 Codec ------------------------------------------------------- */
5262
5263PyObject *
5264PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 Py_ssize_t size,
5266 const char *errors,
5267 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005268{
5269 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5270}
5271
5272PyObject *
5273PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 Py_ssize_t size,
5275 const char *errors,
5276 int *byteorder,
5277 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005278{
5279 const char *starts = s;
5280 Py_ssize_t startinpos;
5281 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005282 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005283 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005284 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005285 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005286 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005287 PyObject *errorHandler = NULL;
5288 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005289
Walter Dörwald41980ca2007-08-16 21:55:45 +00005290 q = (unsigned char *)s;
5291 e = q + size;
5292
5293 if (byteorder)
5294 bo = *byteorder;
5295
5296 /* Check for BOM marks (U+FEFF) in the input and adjust current
5297 byte order setting accordingly. In native mode, the leading BOM
5298 mark is skipped, in all other modes, it is copied to the output
5299 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005300 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005301 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 if (bom == 0x0000FEFF) {
5303 bo = -1;
5304 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005305 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005306 else if (bom == 0xFFFE0000) {
5307 bo = 1;
5308 q += 4;
5309 }
5310 if (byteorder)
5311 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005312 }
5313
Victor Stinnere64322e2012-10-30 23:12:47 +01005314 if (q == e) {
5315 if (consumed)
5316 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005317 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005318 }
5319
Victor Stinnere64322e2012-10-30 23:12:47 +01005320#ifdef WORDS_BIGENDIAN
5321 le = bo < 0;
5322#else
5323 le = bo <= 0;
5324#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005325 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005326
Victor Stinner8f674cc2013-04-17 23:02:17 +02005327 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005328 writer.min_length = (e - q + 3) / 4;
5329 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005330 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005331
Victor Stinnere64322e2012-10-30 23:12:47 +01005332 while (1) {
5333 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005334 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005335
Victor Stinnere64322e2012-10-30 23:12:47 +01005336 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005337 enum PyUnicode_Kind kind = writer.kind;
5338 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005339 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005340 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005341 if (le) {
5342 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005343 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005344 if (ch > maxch)
5345 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005346 if (kind != PyUnicode_1BYTE_KIND &&
5347 Py_UNICODE_IS_SURROGATE(ch))
5348 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005349 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005350 q += 4;
5351 } while (q <= last);
5352 }
5353 else {
5354 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005355 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005356 if (ch > maxch)
5357 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005358 if (kind != PyUnicode_1BYTE_KIND &&
5359 Py_UNICODE_IS_SURROGATE(ch))
5360 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005361 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005362 q += 4;
5363 } while (q <= last);
5364 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005365 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005366 }
5367
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005368 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005369 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005370 startinpos = ((const char *)q) - starts;
5371 endinpos = startinpos + 4;
5372 }
5373 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005374 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005376 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005378 startinpos = ((const char *)q) - starts;
5379 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005380 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005381 else {
5382 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005383 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005384 goto onError;
5385 q += 4;
5386 continue;
5387 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005388 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005389 startinpos = ((const char *)q) - starts;
5390 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005392
5393 /* The remaining input chars are ignored if the callback
5394 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005395 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005397 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005398 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005399 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005401 }
5402
Walter Dörwald41980ca2007-08-16 21:55:45 +00005403 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005405
Walter Dörwald41980ca2007-08-16 21:55:45 +00005406 Py_XDECREF(errorHandler);
5407 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005408 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005409
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005411 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005412 Py_XDECREF(errorHandler);
5413 Py_XDECREF(exc);
5414 return NULL;
5415}
5416
5417PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005418_PyUnicode_EncodeUTF32(PyObject *str,
5419 const char *errors,
5420 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005421{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005422 enum PyUnicode_Kind kind;
5423 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005424 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005425 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005426 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005427#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005428 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005429#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005431#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005432 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005433 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005434 PyObject *errorHandler = NULL;
5435 PyObject *exc = NULL;
5436 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005437
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005438 if (!PyUnicode_Check(str)) {
5439 PyErr_BadArgument();
5440 return NULL;
5441 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005442 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005443 return NULL;
5444 kind = PyUnicode_KIND(str);
5445 data = PyUnicode_DATA(str);
5446 len = PyUnicode_GET_LENGTH(str);
5447
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005448 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005449 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005450 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005451 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005452 if (v == NULL)
5453 return NULL;
5454
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005455 /* output buffer is 4-bytes aligned */
5456 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005457 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005458 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005459 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005460 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005461 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005462
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005463 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005464 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005465 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005466 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005467 else
5468 encoding = "utf-32";
5469
5470 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005471 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5472 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005473 }
5474
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005475 pos = 0;
5476 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005477 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005478
5479 if (kind == PyUnicode_2BYTE_KIND) {
5480 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5481 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005482 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005483 else {
5484 assert(kind == PyUnicode_4BYTE_KIND);
5485 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5486 &out, native_ordering);
5487 }
5488 if (pos == len)
5489 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005490
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005491 rep = unicode_encode_call_errorhandler(
5492 errors, &errorHandler,
5493 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005494 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005495 if (!rep)
5496 goto error;
5497
5498 if (PyBytes_Check(rep)) {
5499 repsize = PyBytes_GET_SIZE(rep);
5500 if (repsize & 3) {
5501 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005502 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 "surrogates not allowed");
5504 goto error;
5505 }
5506 moreunits = repsize / 4;
5507 }
5508 else {
5509 assert(PyUnicode_Check(rep));
5510 if (PyUnicode_READY(rep) < 0)
5511 goto error;
5512 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5513 if (!PyUnicode_IS_ASCII(rep)) {
5514 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005515 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005516 "surrogates not allowed");
5517 goto error;
5518 }
5519 }
5520
5521 /* four bytes are reserved for each surrogate */
5522 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005523 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005524 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005525 /* integer overflow */
5526 PyErr_NoMemory();
5527 goto error;
5528 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005529 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005530 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005531 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005532 }
5533
5534 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005535 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005536 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005537 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005538 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005539 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5540 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005541 }
5542
5543 Py_CLEAR(rep);
5544 }
5545
5546 /* Cut back to size actually needed. This is necessary for, for example,
5547 encoding of a string containing isolated surrogates and the 'ignore'
5548 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005549 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005550 if (nsize != PyBytes_GET_SIZE(v))
5551 _PyBytes_Resize(&v, nsize);
5552 Py_XDECREF(errorHandler);
5553 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005554 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005555 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005556 error:
5557 Py_XDECREF(rep);
5558 Py_XDECREF(errorHandler);
5559 Py_XDECREF(exc);
5560 Py_XDECREF(v);
5561 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005562}
5563
Alexander Belopolsky40018472011-02-26 01:02:56 +00005564PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005565PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5566 Py_ssize_t size,
5567 const char *errors,
5568 int byteorder)
5569{
5570 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005571 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005572 if (tmp == NULL)
5573 return NULL;
5574 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5575 Py_DECREF(tmp);
5576 return result;
5577}
5578
5579PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005580PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005581{
Victor Stinnerb960b342011-11-20 19:12:52 +01005582 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005583}
5584
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585/* --- UTF-16 Codec ------------------------------------------------------- */
5586
Tim Peters772747b2001-08-09 22:21:55 +00005587PyObject *
5588PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 Py_ssize_t size,
5590 const char *errors,
5591 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592{
Walter Dörwald69652032004-09-07 20:24:22 +00005593 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5594}
5595
5596PyObject *
5597PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 Py_ssize_t size,
5599 const char *errors,
5600 int *byteorder,
5601 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005602{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005604 Py_ssize_t startinpos;
5605 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005606 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005608 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005609 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005610 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005611 PyObject *errorHandler = NULL;
5612 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005613 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614
Tim Peters772747b2001-08-09 22:21:55 +00005615 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005616 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617
5618 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005619 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005621 /* Check for BOM marks (U+FEFF) in the input and adjust current
5622 byte order setting accordingly. In native mode, the leading BOM
5623 mark is skipped, in all other modes, it is copied to the output
5624 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005625 if (bo == 0 && size >= 2) {
5626 const Py_UCS4 bom = (q[1] << 8) | q[0];
5627 if (bom == 0xFEFF) {
5628 q += 2;
5629 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005631 else if (bom == 0xFFFE) {
5632 q += 2;
5633 bo = 1;
5634 }
5635 if (byteorder)
5636 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638
Antoine Pitrou63065d72012-05-15 23:48:04 +02005639 if (q == e) {
5640 if (consumed)
5641 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005642 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005643 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005644
Christian Heimes743e0cd2012-10-17 23:52:17 +02005645#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005646 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005647 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005648#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005649 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005650 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005651#endif
Tim Peters772747b2001-08-09 22:21:55 +00005652
Antoine Pitrou63065d72012-05-15 23:48:04 +02005653 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005654 character count normally. Error handler will take care of
5655 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005656 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005657 writer.min_length = (e - q + 1) / 2;
5658 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005659 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005660
Antoine Pitrou63065d72012-05-15 23:48:04 +02005661 while (1) {
5662 Py_UCS4 ch = 0;
5663 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005664 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005665 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005666 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005667 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005668 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005669 native_ordering);
5670 else
5671 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005672 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005673 native_ordering);
5674 } else if (kind == PyUnicode_2BYTE_KIND) {
5675 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005676 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005677 native_ordering);
5678 } else {
5679 assert(kind == PyUnicode_4BYTE_KIND);
5680 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005681 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005682 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005683 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005684 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685
Antoine Pitrou63065d72012-05-15 23:48:04 +02005686 switch (ch)
5687 {
5688 case 0:
5689 /* remaining byte at the end? (size should be even) */
5690 if (q == e || consumed)
5691 goto End;
5692 errmsg = "truncated data";
5693 startinpos = ((const char *)q) - starts;
5694 endinpos = ((const char *)e) - starts;
5695 break;
5696 /* The remaining input chars are ignored if the callback
5697 chooses to skip the input */
5698 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005699 q -= 2;
5700 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005701 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005702 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005703 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005704 endinpos = ((const char *)e) - starts;
5705 break;
5706 case 2:
5707 errmsg = "illegal encoding";
5708 startinpos = ((const char *)q) - 2 - starts;
5709 endinpos = startinpos + 2;
5710 break;
5711 case 3:
5712 errmsg = "illegal UTF-16 surrogate";
5713 startinpos = ((const char *)q) - 4 - starts;
5714 endinpos = startinpos + 2;
5715 break;
5716 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005717 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005718 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 continue;
5720 }
5721
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005722 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005723 errors,
5724 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005725 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005726 &starts,
5727 (const char **)&e,
5728 &startinpos,
5729 &endinpos,
5730 &exc,
5731 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005732 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 }
5735
Antoine Pitrou63065d72012-05-15 23:48:04 +02005736End:
Walter Dörwald69652032004-09-07 20:24:22 +00005737 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005739
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740 Py_XDECREF(errorHandler);
5741 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005742 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743
Benjamin Peterson29060642009-01-31 22:14:21 +00005744 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005745 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 Py_XDECREF(errorHandler);
5747 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 return NULL;
5749}
5750
Tim Peters772747b2001-08-09 22:21:55 +00005751PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005752_PyUnicode_EncodeUTF16(PyObject *str,
5753 const char *errors,
5754 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005756 enum PyUnicode_Kind kind;
5757 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005758 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005759 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005760 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005761 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005762#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005763 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005764#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005765 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005766#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 const char *encoding;
5768 Py_ssize_t nsize, pos;
5769 PyObject *errorHandler = NULL;
5770 PyObject *exc = NULL;
5771 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005772
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005773 if (!PyUnicode_Check(str)) {
5774 PyErr_BadArgument();
5775 return NULL;
5776 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005777 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005778 return NULL;
5779 kind = PyUnicode_KIND(str);
5780 data = PyUnicode_DATA(str);
5781 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005782
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005783 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005784 if (kind == PyUnicode_4BYTE_KIND) {
5785 const Py_UCS4 *in = (const Py_UCS4 *)data;
5786 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005787 while (in < end) {
5788 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005789 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005790 }
5791 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005792 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005793 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005795 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005796 nsize = len + pairs + (byteorder == 0);
5797 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005798 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005802 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005803 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005804 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005805 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005806 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005807 }
5808 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005809 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005810 }
Tim Peters772747b2001-08-09 22:21:55 +00005811
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005812 if (kind == PyUnicode_1BYTE_KIND) {
5813 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5814 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005815 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005816
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005817 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005818 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005819 }
5820 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005821 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005822 }
5823 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005824 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005825 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005826
5827 pos = 0;
5828 while (pos < len) {
5829 Py_ssize_t repsize, moreunits;
5830
5831 if (kind == PyUnicode_2BYTE_KIND) {
5832 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5833 &out, native_ordering);
5834 }
5835 else {
5836 assert(kind == PyUnicode_4BYTE_KIND);
5837 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5838 &out, native_ordering);
5839 }
5840 if (pos == len)
5841 break;
5842
5843 rep = unicode_encode_call_errorhandler(
5844 errors, &errorHandler,
5845 encoding, "surrogates not allowed",
5846 str, &exc, pos, pos + 1, &pos);
5847 if (!rep)
5848 goto error;
5849
5850 if (PyBytes_Check(rep)) {
5851 repsize = PyBytes_GET_SIZE(rep);
5852 if (repsize & 1) {
5853 raise_encode_exception(&exc, encoding,
5854 str, pos - 1, pos,
5855 "surrogates not allowed");
5856 goto error;
5857 }
5858 moreunits = repsize / 2;
5859 }
5860 else {
5861 assert(PyUnicode_Check(rep));
5862 if (PyUnicode_READY(rep) < 0)
5863 goto error;
5864 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5865 if (!PyUnicode_IS_ASCII(rep)) {
5866 raise_encode_exception(&exc, encoding,
5867 str, pos - 1, pos,
5868 "surrogates not allowed");
5869 goto error;
5870 }
5871 }
5872
5873 /* two bytes are reserved for each surrogate */
5874 if (moreunits > 1) {
5875 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005876 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005877 /* integer overflow */
5878 PyErr_NoMemory();
5879 goto error;
5880 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005881 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005882 goto error;
5883 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5884 }
5885
5886 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005887 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005888 out += moreunits;
5889 } else /* rep is unicode */ {
5890 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5891 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5892 &out, native_ordering);
5893 }
5894
5895 Py_CLEAR(rep);
5896 }
5897
5898 /* Cut back to size actually needed. This is necessary for, for example,
5899 encoding of a string containing isolated surrogates and the 'ignore' handler
5900 is used. */
5901 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5902 if (nsize != PyBytes_GET_SIZE(v))
5903 _PyBytes_Resize(&v, nsize);
5904 Py_XDECREF(errorHandler);
5905 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005906 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005907 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005908 error:
5909 Py_XDECREF(rep);
5910 Py_XDECREF(errorHandler);
5911 Py_XDECREF(exc);
5912 Py_XDECREF(v);
5913 return NULL;
5914#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915}
5916
Alexander Belopolsky40018472011-02-26 01:02:56 +00005917PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005918PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5919 Py_ssize_t size,
5920 const char *errors,
5921 int byteorder)
5922{
5923 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005924 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005925 if (tmp == NULL)
5926 return NULL;
5927 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5928 Py_DECREF(tmp);
5929 return result;
5930}
5931
5932PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005933PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005935 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936}
5937
5938/* --- Unicode Escape Codec ----------------------------------------------- */
5939
/* Cached pointer to the unicodedata name-lookup C API.  It starts out NULL
   and is filled in with PyCapsule_Import() the first time the unicode-escape
   decoder meets a \N{...} escape. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005940static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005941
Alexander Belopolsky40018472011-02-26 01:02:56 +00005942PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005943_PyUnicode_DecodeUnicodeEscape(const char *s,
5944 Py_ssize_t size,
5945 const char *errors,
5946 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005948 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005949 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005951 PyObject *errorHandler = NULL;
5952 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005953
Eric V. Smith42454af2016-10-31 09:22:08 -04005954 // so we can remember if we've seen an invalid escape char or not
5955 *first_invalid_escape = NULL;
5956
Victor Stinner62ec3312016-09-06 17:04:34 -07005957 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005958 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005959 }
5960 /* Escaped strings will always be longer than the resulting
5961 Unicode string, so we start with size here and then reduce the
5962 length after conversion to the true value.
5963 (but if the error callback returns a long replacement string
5964 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005965 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005966 writer.min_length = size;
5967 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5968 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005969 }
5970
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 end = s + size;
5972 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005973 unsigned char c = (unsigned char) *s++;
5974 Py_UCS4 ch;
5975 int count;
5976 Py_ssize_t startinpos;
5977 Py_ssize_t endinpos;
5978 const char *message;
5979
5980#define WRITE_ASCII_CHAR(ch) \
5981 do { \
5982 assert(ch <= 127); \
5983 assert(writer.pos < writer.size); \
5984 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5985 } while(0)
5986
5987#define WRITE_CHAR(ch) \
5988 do { \
5989 if (ch <= writer.maxchar) { \
5990 assert(writer.pos < writer.size); \
5991 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5992 } \
5993 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5994 goto onError; \
5995 } \
5996 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
5998 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005999 if (c != '\\') {
6000 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 continue;
6002 }
6003
Victor Stinner62ec3312016-09-06 17:04:34 -07006004 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006006 if (s >= end) {
6007 message = "\\ at end of string";
6008 goto error;
6009 }
6010 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006011
Victor Stinner62ec3312016-09-06 17:04:34 -07006012 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006013 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006016 case '\n': continue;
6017 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6018 case '\'': WRITE_ASCII_CHAR('\''); continue;
6019 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6020 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006021 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006022 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6023 case 't': WRITE_ASCII_CHAR('\t'); continue;
6024 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6025 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006026 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006027 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006028 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006029 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 case '0': case '1': case '2': case '3':
6033 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006034 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006035 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006036 ch = (ch<<3) + *s++ - '0';
6037 if (s < end && '0' <= *s && *s <= '7') {
6038 ch = (ch<<3) + *s++ - '0';
6039 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006041 WRITE_CHAR(ch);
6042 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 /* hex escapes */
6045 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006047 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006048 message = "truncated \\xXX escape";
6049 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006053 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006054 message = "truncated \\uXXXX escape";
6055 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006058 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006059 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006060 message = "truncated \\UXXXXXXXX escape";
6061 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006062 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006063 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006064 ch <<= 4;
6065 if (c >= '0' && c <= '9') {
6066 ch += c - '0';
6067 }
6068 else if (c >= 'a' && c <= 'f') {
6069 ch += c - ('a' - 10);
6070 }
6071 else if (c >= 'A' && c <= 'F') {
6072 ch += c - ('A' - 10);
6073 }
6074 else {
6075 break;
6076 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006077 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006078 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006079 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006080 }
6081
6082 /* when we get here, ch is a 32-bit unicode character */
6083 if (ch > MAX_UNICODE) {
6084 message = "illegal Unicode character";
6085 goto error;
6086 }
6087
6088 WRITE_CHAR(ch);
6089 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006090
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006092 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006093 if (ucnhash_CAPI == NULL) {
6094 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006095 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6096 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006097 if (ucnhash_CAPI == NULL) {
6098 PyErr_SetString(
6099 PyExc_UnicodeError,
6100 "\\N escapes not supported (can't load unicodedata module)"
6101 );
6102 goto onError;
6103 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006104 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006105
6106 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006107 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006108 const char *start = ++s;
6109 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006110 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006111 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006112 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006113 namelen = s - start;
6114 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006115 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006116 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006117 ch = 0xffffffff; /* in case 'getcode' messes up */
6118 if (namelen <= INT_MAX &&
6119 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6120 &ch, 0)) {
6121 assert(ch <= MAX_UNICODE);
6122 WRITE_CHAR(ch);
6123 continue;
6124 }
6125 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006126 }
6127 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006128 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006129
6130 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006131 if (*first_invalid_escape == NULL) {
6132 *first_invalid_escape = s-1; /* Back up one char, since we've
6133 already incremented s. */
6134 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006135 WRITE_ASCII_CHAR('\\');
6136 WRITE_CHAR(c);
6137 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006139
6140 error:
6141 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006142 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006143 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006144 errors, &errorHandler,
6145 "unicodeescape", message,
6146 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006147 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006148 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006149 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006150 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006151
6152#undef WRITE_ASCII_CHAR
6153#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006155
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006156 Py_XDECREF(errorHandler);
6157 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006158 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006159
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006161 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006162 Py_XDECREF(errorHandler);
6163 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 return NULL;
6165}
6166
Eric V. Smith42454af2016-10-31 09:22:08 -04006167PyObject *
6168PyUnicode_DecodeUnicodeEscape(const char *s,
6169 Py_ssize_t size,
6170 const char *errors)
6171{
6172 const char *first_invalid_escape;
6173 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6174 &first_invalid_escape);
6175 if (result == NULL)
6176 return NULL;
6177 if (first_invalid_escape != NULL) {
6178 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6179 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006180 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006181 Py_DECREF(result);
6182 return NULL;
6183 }
6184 }
6185 return result;
6186}
6187
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006188/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189
Alexander Belopolsky40018472011-02-26 01:02:56 +00006190PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006191PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006193 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006194 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006196 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006197 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006198 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199
Ezio Melottie7f90372012-10-05 03:33:31 +03006200 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006201 escape.
6202
Ezio Melottie7f90372012-10-05 03:33:31 +03006203 For UCS1 strings it's '\xxx', 4 bytes per source character.
6204 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6205 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006206 */
6207
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006208 if (!PyUnicode_Check(unicode)) {
6209 PyErr_BadArgument();
6210 return NULL;
6211 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006212 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006213 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006214 }
Victor Stinner358af132015-10-12 22:36:57 +02006215
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006216 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006217 if (len == 0) {
6218 return PyBytes_FromStringAndSize(NULL, 0);
6219 }
6220
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006221 kind = PyUnicode_KIND(unicode);
6222 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006223 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6224 bytes, and 1 byte characters 4. */
6225 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006226 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006227 return PyErr_NoMemory();
6228 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006229 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 if (repr == NULL) {
6231 return NULL;
6232 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006233
Victor Stinner62ec3312016-09-06 17:04:34 -07006234 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006235 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006236 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006237
Victor Stinner62ec3312016-09-06 17:04:34 -07006238 /* U+0000-U+00ff range */
6239 if (ch < 0x100) {
6240 if (ch >= ' ' && ch < 127) {
6241 if (ch != '\\') {
6242 /* Copy printable US ASCII as-is */
6243 *p++ = (char) ch;
6244 }
6245 /* Escape backslashes */
6246 else {
6247 *p++ = '\\';
6248 *p++ = '\\';
6249 }
6250 }
Victor Stinner358af132015-10-12 22:36:57 +02006251
Victor Stinner62ec3312016-09-06 17:04:34 -07006252 /* Map special whitespace to '\t', \n', '\r' */
6253 else if (ch == '\t') {
6254 *p++ = '\\';
6255 *p++ = 't';
6256 }
6257 else if (ch == '\n') {
6258 *p++ = '\\';
6259 *p++ = 'n';
6260 }
6261 else if (ch == '\r') {
6262 *p++ = '\\';
6263 *p++ = 'r';
6264 }
6265
6266 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6267 else {
6268 *p++ = '\\';
6269 *p++ = 'x';
6270 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6271 *p++ = Py_hexdigits[ch & 0x000F];
6272 }
Tim Petersced69f82003-09-16 20:30:58 +00006273 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006274 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006275 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 *p++ = '\\';
6277 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006278 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6279 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6280 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6281 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006283 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6284 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006285
Victor Stinner62ec3312016-09-06 17:04:34 -07006286 /* Make sure that the first two digits are zero */
6287 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006288 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006289 *p++ = 'U';
6290 *p++ = '0';
6291 *p++ = '0';
6292 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6293 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6294 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6295 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6296 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6297 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006298 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 assert(p - PyBytes_AS_STRING(repr) > 0);
6302 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6303 return NULL;
6304 }
6305 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306}
6307
Alexander Belopolsky40018472011-02-26 01:02:56 +00006308PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006309PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6310 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006312 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006313 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006316 }
6317
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006318 result = PyUnicode_AsUnicodeEscapeString(tmp);
6319 Py_DECREF(tmp);
6320 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321}
6322
6323/* --- Raw Unicode Escape Codec ------------------------------------------- */
6324
Alexander Belopolsky40018472011-02-26 01:02:56 +00006325PyObject *
6326PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006327 Py_ssize_t size,
6328 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006330 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006331 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006333 PyObject *errorHandler = NULL;
6334 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006335
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006337 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006338 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006339
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340 /* Escaped strings will always be longer than the resulting
6341 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342 length after conversion to the true value. (But decoding error
6343 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006344 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 writer.min_length = size;
6346 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6347 goto onError;
6348 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006349
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 end = s + size;
6351 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006352 unsigned char c = (unsigned char) *s++;
6353 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006354 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006355 Py_ssize_t startinpos;
6356 Py_ssize_t endinpos;
6357 const char *message;
6358
6359#define WRITE_CHAR(ch) \
6360 do { \
6361 if (ch <= writer.maxchar) { \
6362 assert(writer.pos < writer.size); \
6363 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6364 } \
6365 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6366 goto onError; \
6367 } \
6368 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006371 if (c != '\\' || s >= end) {
6372 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006374 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006375
Victor Stinner62ec3312016-09-06 17:04:34 -07006376 c = (unsigned char) *s++;
6377 if (c == 'u') {
6378 count = 4;
6379 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006381 else if (c == 'U') {
6382 count = 8;
6383 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006384 }
6385 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006386 assert(writer.pos < writer.size);
6387 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6388 WRITE_CHAR(c);
6389 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006390 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006391 startinpos = s - starts - 2;
6392
6393 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6394 for (ch = 0; count && s < end; ++s, --count) {
6395 c = (unsigned char)*s;
6396 ch <<= 4;
6397 if (c >= '0' && c <= '9') {
6398 ch += c - '0';
6399 }
6400 else if (c >= 'a' && c <= 'f') {
6401 ch += c - ('a' - 10);
6402 }
6403 else if (c >= 'A' && c <= 'F') {
6404 ch += c - ('A' - 10);
6405 }
6406 else {
6407 break;
6408 }
6409 }
6410 if (!count) {
6411 if (ch <= MAX_UNICODE) {
6412 WRITE_CHAR(ch);
6413 continue;
6414 }
6415 message = "\\Uxxxxxxxx out of range";
6416 }
6417
6418 endinpos = s-starts;
6419 writer.min_length = end - s + writer.pos;
6420 if (unicode_decode_call_errorhandler_writer(
6421 errors, &errorHandler,
6422 "rawunicodeescape", message,
6423 &starts, &end, &startinpos, &endinpos, &exc, &s,
6424 &writer)) {
6425 goto onError;
6426 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006427 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006428
6429#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006431 Py_XDECREF(errorHandler);
6432 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006433 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006434
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006436 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 Py_XDECREF(errorHandler);
6438 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006440
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441}
6442
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006443
Alexander Belopolsky40018472011-02-26 01:02:56 +00006444PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006445PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446{
Victor Stinner62ec3312016-09-06 17:04:34 -07006447 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006449 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006450 int kind;
6451 void *data;
6452 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006454 if (!PyUnicode_Check(unicode)) {
6455 PyErr_BadArgument();
6456 return NULL;
6457 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006458 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006459 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006460 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006461 kind = PyUnicode_KIND(unicode);
6462 data = PyUnicode_DATA(unicode);
6463 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 if (kind == PyUnicode_1BYTE_KIND) {
6465 return PyBytes_FromStringAndSize(data, len);
6466 }
Victor Stinner0e368262011-11-10 20:12:49 +01006467
Victor Stinner62ec3312016-09-06 17:04:34 -07006468 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6469 bytes, and 1 byte characters 4. */
6470 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006471
Victor Stinner62ec3312016-09-06 17:04:34 -07006472 if (len > PY_SSIZE_T_MAX / expandsize) {
6473 return PyErr_NoMemory();
6474 }
6475 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6476 if (repr == NULL) {
6477 return NULL;
6478 }
6479 if (len == 0) {
6480 return repr;
6481 }
6482
6483 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006484 for (pos = 0; pos < len; pos++) {
6485 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006486
Victor Stinner62ec3312016-09-06 17:04:34 -07006487 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6488 if (ch < 0x100) {
6489 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006490 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006491 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006492 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 *p++ = '\\';
6494 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006495 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6496 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6497 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6498 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006500 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6501 else {
6502 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6503 *p++ = '\\';
6504 *p++ = 'U';
6505 *p++ = '0';
6506 *p++ = '0';
6507 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6508 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6509 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6510 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6511 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6512 *p++ = Py_hexdigits[ch & 15];
6513 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006515
Victor Stinner62ec3312016-09-06 17:04:34 -07006516 assert(p > PyBytes_AS_STRING(repr));
6517 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6518 return NULL;
6519 }
6520 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521}
6522
Alexander Belopolsky40018472011-02-26 01:02:56 +00006523PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006524PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6525 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006527 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006528 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006529 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006530 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006531 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6532 Py_DECREF(tmp);
6533 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534}
6535
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006536/* --- Unicode Internal Codec ------------------------------------------- */
6537
Alexander Belopolsky40018472011-02-26 01:02:56 +00006538PyObject *
6539_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006540 Py_ssize_t size,
6541 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006542{
6543 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006544 Py_ssize_t startinpos;
6545 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006546 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006547 const char *end;
6548 const char *reason;
6549 PyObject *errorHandler = NULL;
6550 PyObject *exc = NULL;
6551
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006552 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006553 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006554 1))
6555 return NULL;
6556
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006557 if (size < 0) {
6558 PyErr_BadInternalCall();
6559 return NULL;
6560 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006561 if (size == 0)
6562 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006563
Victor Stinner8f674cc2013-04-17 23:02:17 +02006564 _PyUnicodeWriter_Init(&writer);
6565 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6566 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006568 }
6569 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006570
Victor Stinner8f674cc2013-04-17 23:02:17 +02006571 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006572 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006573 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006574 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006575 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006576 endinpos = end-starts;
6577 reason = "truncated input";
6578 goto error;
6579 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006580 /* We copy the raw representation one byte at a time because the
6581 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006582 ((char *) &uch)[0] = s[0];
6583 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006584#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006585 ((char *) &uch)[2] = s[2];
6586 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006587#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006588 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006589#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006590 /* We have to sanity check the raw data, otherwise doom looms for
6591 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006592 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006593 endinpos = s - starts + Py_UNICODE_SIZE;
6594 reason = "illegal code point (> 0x10FFFF)";
6595 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006596 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006597#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006598 s += Py_UNICODE_SIZE;
6599#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006600 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006601 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006602 Py_UNICODE uch2;
6603 ((char *) &uch2)[0] = s[0];
6604 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006605 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006606 {
Victor Stinner551ac952011-11-29 22:58:13 +01006607 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006608 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006609 }
6610 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006611#endif
6612
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006613 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006614 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006615 continue;
6616
6617 error:
6618 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006619 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006620 errors, &errorHandler,
6621 "unicode_internal", reason,
6622 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006623 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006624 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006625 }
6626
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006627 Py_XDECREF(errorHandler);
6628 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006629 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006630
Benjamin Peterson29060642009-01-31 22:14:21 +00006631 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006632 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006633 Py_XDECREF(errorHandler);
6634 Py_XDECREF(exc);
6635 return NULL;
6636}
6637
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638/* --- Latin-1 Codec ------------------------------------------------------ */
6639
Alexander Belopolsky40018472011-02-26 01:02:56 +00006640PyObject *
6641PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006642 Py_ssize_t size,
6643 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006646 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647}
6648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006649/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006650static void
6651make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006652 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006653 PyObject *unicode,
6654 Py_ssize_t startpos, Py_ssize_t endpos,
6655 const char *reason)
6656{
6657 if (*exceptionObject == NULL) {
6658 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006660 encoding, unicode, startpos, endpos, reason);
6661 }
6662 else {
6663 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6664 goto onError;
6665 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6666 goto onError;
6667 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6668 goto onError;
6669 return;
6670 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006671 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006672 }
6673}
6674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006675/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006676static void
6677raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006678 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006679 PyObject *unicode,
6680 Py_ssize_t startpos, Py_ssize_t endpos,
6681 const char *reason)
6682{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006683 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006684 encoding, unicode, startpos, endpos, reason);
6685 if (*exceptionObject != NULL)
6686 PyCodec_StrictErrors(*exceptionObject);
6687}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688
6689/* error handling callback helper:
6690 build arguments, call the callback and check the arguments,
6691 put the result into newpos and return the replacement string, which
6692 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006693static PyObject *
6694unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006695 PyObject **errorHandler,
6696 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006697 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006698 Py_ssize_t startpos, Py_ssize_t endpos,
6699 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006701 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703 PyObject *restuple;
6704 PyObject *resunicode;
6705
6706 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006707 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006710 }
6711
Benjamin Petersonbac79492012-01-14 13:34:47 -05006712 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006713 return NULL;
6714 len = PyUnicode_GET_LENGTH(unicode);
6715
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006716 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006717 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006720
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006721 restuple = PyObject_CallFunctionObjArgs(
6722 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006726 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 Py_DECREF(restuple);
6728 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006729 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006730 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 &resunicode, newpos)) {
6732 Py_DECREF(restuple);
6733 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006734 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006735 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6736 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6737 Py_DECREF(restuple);
6738 return NULL;
6739 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006741 *newpos = len + *newpos;
6742 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006743 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 Py_DECREF(restuple);
6745 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006746 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006747 Py_INCREF(resunicode);
6748 Py_DECREF(restuple);
6749 return resunicode;
6750}
6751
Alexander Belopolsky40018472011-02-26 01:02:56 +00006752static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006754 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006755 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006756{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006757 /* input state */
6758 Py_ssize_t pos=0, size;
6759 int kind;
6760 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006761 /* pointer into the output */
6762 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006763 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6764 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006765 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006766 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006767 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006768 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006769 /* output object */
6770 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006771
Benjamin Petersonbac79492012-01-14 13:34:47 -05006772 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006773 return NULL;
6774 size = PyUnicode_GET_LENGTH(unicode);
6775 kind = PyUnicode_KIND(unicode);
6776 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006777 /* allocate enough for a simple encoding without
6778 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006779 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006780 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006781
6782 _PyBytesWriter_Init(&writer);
6783 str = _PyBytesWriter_Alloc(&writer, size);
6784 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006785 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006786
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006787 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006788 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006789
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006791 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006793 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006795 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006796 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006797 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006799 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006800 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006802
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006803 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006805
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006806 /* Only overallocate the buffer if it's not the last write */
6807 writer.overallocate = (collend < size);
6808
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006810 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006811 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006812
6813 switch (error_handler) {
6814 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006815 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006817
6818 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006819 memset(str, '?', collend - collstart);
6820 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006821 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006822 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006823 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 break;
Victor Stinner50149202015-09-22 00:26:54 +02006825
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006826 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006827 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006828 writer.min_size -= (collend - collstart);
6829 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006830 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006831 if (str == NULL)
6832 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006833 pos = collend;
6834 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006835
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006836 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006837 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006838 writer.min_size -= (collend - collstart);
6839 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006840 unicode, collstart, collend);
6841 if (str == NULL)
6842 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006843 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 break;
Victor Stinner50149202015-09-22 00:26:54 +02006845
Victor Stinnerc3713e92015-09-29 12:32:13 +02006846 case _Py_ERROR_SURROGATEESCAPE:
6847 for (i = collstart; i < collend; ++i) {
6848 ch = PyUnicode_READ(kind, data, i);
6849 if (ch < 0xdc80 || 0xdcff < ch) {
6850 /* Not a UTF-8b surrogate */
6851 break;
6852 }
6853 *str++ = (char)(ch - 0xdc00);
6854 ++pos;
6855 }
6856 if (i >= collend)
6857 break;
6858 collstart = pos;
6859 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006860 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006861
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006863 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6864 encoding, reason, unicode, &exc,
6865 collstart, collend, &newpos);
6866 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006867 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006868
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006869 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006870 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006871
Victor Stinner6bd525b2015-10-09 13:10:05 +02006872 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006873 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006874 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006875 PyBytes_AS_STRING(rep),
6876 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006877 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006878 else {
6879 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006880
Victor Stinner6bd525b2015-10-09 13:10:05 +02006881 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006882 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006883
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006884 if (limit == 256 ?
6885 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6886 !PyUnicode_IS_ASCII(rep))
6887 {
6888 /* Not all characters are smaller than limit */
6889 raise_encode_exception(&exc, encoding, unicode,
6890 collstart, collend, reason);
6891 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006893 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6894 str = _PyBytesWriter_WriteBytes(&writer, str,
6895 PyUnicode_DATA(rep),
6896 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006898 if (str == NULL)
6899 goto onError;
6900
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006901 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006902 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006903 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006904
6905 /* If overallocation was disabled, ensure that it was the last
6906 write. Otherwise, we missed an optimization */
6907 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006908 }
6909 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006910
Victor Stinner50149202015-09-22 00:26:54 +02006911 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006912 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006913 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006914
6915 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006916 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006917 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006918 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006919 Py_XDECREF(exc);
6920 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006921}
6922
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006923/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006924PyObject *
6925PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006926 Py_ssize_t size,
6927 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006929 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006930 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006931 if (unicode == NULL)
6932 return NULL;
6933 result = unicode_encode_ucs1(unicode, errors, 256);
6934 Py_DECREF(unicode);
6935 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936}
6937
Alexander Belopolsky40018472011-02-26 01:02:56 +00006938PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006939_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940{
6941 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 PyErr_BadArgument();
6943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006945 if (PyUnicode_READY(unicode) == -1)
6946 return NULL;
6947 /* Fast path: if it is a one-byte string, construct
6948 bytes object directly. */
6949 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6950 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6951 PyUnicode_GET_LENGTH(unicode));
6952 /* Non-Latin-1 characters present. Defer to above function to
6953 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006954 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006955}
6956
6957PyObject*
6958PyUnicode_AsLatin1String(PyObject *unicode)
6959{
6960 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961}
6962
6963/* --- 7-bit ASCII Codec -------------------------------------------------- */
6964
Alexander Belopolsky40018472011-02-26 01:02:56 +00006965PyObject *
6966PyUnicode_DecodeASCII(const char *s,
6967 Py_ssize_t size,
6968 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006970 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006971 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006972 int kind;
6973 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006974 Py_ssize_t startinpos;
6975 Py_ssize_t endinpos;
6976 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006977 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006978 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006979 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006980 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006981
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006983 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006984
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006986 if (size == 1 && (unsigned char)s[0] < 128)
6987 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006988
Victor Stinner8f674cc2013-04-17 23:02:17 +02006989 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006990 writer.min_length = size;
6991 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006992 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006993
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006994 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006995 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006996 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006997 writer.pos = outpos;
6998 if (writer.pos == size)
6999 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007000
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007001 s += writer.pos;
7002 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007003 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007004 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007006 PyUnicode_WRITE(kind, data, writer.pos, c);
7007 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007009 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007011
7012 /* byte outsize range 0x00..0x7f: call the error handler */
7013
7014 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007015 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007016
7017 switch (error_handler)
7018 {
7019 case _Py_ERROR_REPLACE:
7020 case _Py_ERROR_SURROGATEESCAPE:
7021 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007022 but we may switch to UCS2 at the first write */
7023 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7024 goto onError;
7025 kind = writer.kind;
7026 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007027
7028 if (error_handler == _Py_ERROR_REPLACE)
7029 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7030 else
7031 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7032 writer.pos++;
7033 ++s;
7034 break;
7035
7036 case _Py_ERROR_IGNORE:
7037 ++s;
7038 break;
7039
7040 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007041 startinpos = s-starts;
7042 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007043 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007044 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 "ascii", "ordinal not in range(128)",
7046 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007047 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007049 kind = writer.kind;
7050 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007053 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007054 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007055 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007056
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007058 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007059 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007060 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 return NULL;
7062}
7063
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007064/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007065PyObject *
7066PyUnicode_EncodeASCII(const Py_UNICODE *p,
7067 Py_ssize_t size,
7068 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007070 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007071 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007072 if (unicode == NULL)
7073 return NULL;
7074 result = unicode_encode_ucs1(unicode, errors, 128);
7075 Py_DECREF(unicode);
7076 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077}
7078
Alexander Belopolsky40018472011-02-26 01:02:56 +00007079PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007080_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081{
7082 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 PyErr_BadArgument();
7084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007086 if (PyUnicode_READY(unicode) == -1)
7087 return NULL;
7088 /* Fast path: if it is an ASCII-only string, construct bytes object
7089 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007090 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007091 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7092 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007093 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007094}
7095
7096PyObject *
7097PyUnicode_AsASCIIString(PyObject *unicode)
7098{
7099 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100}
7101
Steve Dowercc16be82016-09-08 10:35:16 -07007102#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007103
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007104/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007105
/* When int is narrower than size_t, a Py_ssize_t length may exceed INT_MAX;
   NEED_RETRY enables the "size > INT_MAX" branch in
   decode_code_page_stateful(), which splits the input into int-sized
   chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Provide the flag value when the included headers do not define it. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7113
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007114static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007115code_page_name(UINT code_page, PyObject **obj)
7116{
7117 *obj = NULL;
7118 if (code_page == CP_ACP)
7119 return "mbcs";
7120 if (code_page == CP_UTF7)
7121 return "CP_UTF7";
7122 if (code_page == CP_UTF8)
7123 return "CP_UTF8";
7124
7125 *obj = PyBytes_FromFormat("cp%u", code_page);
7126 if (*obj == NULL)
7127 return NULL;
7128 return PyBytes_AS_STRING(*obj);
7129}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007130
Victor Stinner3a50e702011-10-18 21:21:00 +02007131static DWORD
7132decode_code_page_flags(UINT code_page)
7133{
7134 if (code_page == CP_UTF7) {
7135 /* The CP_UTF7 decoder only supports flags=0 */
7136 return 0;
7137 }
7138 else
7139 return MB_ERR_INVALID_CHARS;
7140}
7141
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007142/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 * Decode a byte string from a Windows code page into unicode object in strict
7144 * mode.
7145 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007146 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7147 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007148 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007149static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007150decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007151 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007152 const char *in,
7153 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007154{
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007156 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007158
7159 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007160 assert(insize > 0);
7161 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7162 if (outsize <= 0)
7163 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007164
7165 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007167 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007168 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 if (*v == NULL)
7170 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007171 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007172 }
7173 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007175 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007176 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007179 }
7180
7181 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7183 if (outsize <= 0)
7184 goto error;
7185 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007186
Victor Stinner3a50e702011-10-18 21:21:00 +02007187error:
7188 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7189 return -2;
7190 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007191 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007192}
7193
Victor Stinner3a50e702011-10-18 21:21:00 +02007194/*
7195 * Decode a byte string from a code page into unicode object with an error
7196 * handler.
7197 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007198 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 * UnicodeDecodeError exception and returns -1 on error.
7200 */
7201static int
7202decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007203 PyObject **v,
7204 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007205 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007206{
7207 const char *startin = in;
7208 const char *endin = in + size;
7209 const DWORD flags = decode_code_page_flags(code_page);
7210 /* Ideally, we should get reason from FormatMessage. This is the Windows
7211 2000 English version of the message. */
7212 const char *reason = "No mapping for the Unicode character exists "
7213 "in the target code page.";
7214 /* each step cannot decode more than 1 character, but a character can be
7215 represented as a surrogate pair */
7216 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007217 int insize;
7218 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007219 PyObject *errorHandler = NULL;
7220 PyObject *exc = NULL;
7221 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007222 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007223 DWORD err;
7224 int ret = -1;
7225
7226 assert(size > 0);
7227
7228 encoding = code_page_name(code_page, &encoding_obj);
7229 if (encoding == NULL)
7230 return -1;
7231
Victor Stinner7d00cc12014-03-17 23:08:06 +01007232 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007233 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7234 UnicodeDecodeError. */
7235 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7236 if (exc != NULL) {
7237 PyCodec_StrictErrors(exc);
7238 Py_CLEAR(exc);
7239 }
7240 goto error;
7241 }
7242
7243 if (*v == NULL) {
7244 /* Create unicode object */
7245 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7246 PyErr_NoMemory();
7247 goto error;
7248 }
Victor Stinnerab595942011-12-17 04:59:06 +01007249 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007250 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 if (*v == NULL)
7252 goto error;
7253 startout = PyUnicode_AS_UNICODE(*v);
7254 }
7255 else {
7256 /* Extend unicode object */
7257 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7258 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7259 PyErr_NoMemory();
7260 goto error;
7261 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007262 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 goto error;
7264 startout = PyUnicode_AS_UNICODE(*v) + n;
7265 }
7266
7267 /* Decode the byte string character per character */
7268 out = startout;
7269 while (in < endin)
7270 {
7271 /* Decode a character */
7272 insize = 1;
7273 do
7274 {
7275 outsize = MultiByteToWideChar(code_page, flags,
7276 in, insize,
7277 buffer, Py_ARRAY_LENGTH(buffer));
7278 if (outsize > 0)
7279 break;
7280 err = GetLastError();
7281 if (err != ERROR_NO_UNICODE_TRANSLATION
7282 && err != ERROR_INSUFFICIENT_BUFFER)
7283 {
7284 PyErr_SetFromWindowsErr(0);
7285 goto error;
7286 }
7287 insize++;
7288 }
7289 /* 4=maximum length of a UTF-8 sequence */
7290 while (insize <= 4 && (in + insize) <= endin);
7291
7292 if (outsize <= 0) {
7293 Py_ssize_t startinpos, endinpos, outpos;
7294
Victor Stinner7d00cc12014-03-17 23:08:06 +01007295 /* last character in partial decode? */
7296 if (in + insize >= endin && !final)
7297 break;
7298
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 startinpos = in - startin;
7300 endinpos = startinpos + 1;
7301 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007302 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007303 errors, &errorHandler,
7304 encoding, reason,
7305 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007306 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007307 {
7308 goto error;
7309 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007310 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007311 }
7312 else {
7313 in += insize;
7314 memcpy(out, buffer, outsize * sizeof(wchar_t));
7315 out += outsize;
7316 }
7317 }
7318
7319 /* write a NUL character at the end */
7320 *out = 0;
7321
7322 /* Extend unicode object */
7323 outsize = out - startout;
7324 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007325 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007326 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007327 /* (in - startin) <= size and size is an int */
7328 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007329
7330error:
7331 Py_XDECREF(encoding_obj);
7332 Py_XDECREF(errorHandler);
7333 Py_XDECREF(exc);
7334 return ret;
7335}
7336
Victor Stinner3a50e702011-10-18 21:21:00 +02007337static PyObject *
7338decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007339 const char *s, Py_ssize_t size,
7340 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007341{
Victor Stinner76a31a62011-11-04 00:05:13 +01007342 PyObject *v = NULL;
7343 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007344
Victor Stinner3a50e702011-10-18 21:21:00 +02007345 if (code_page < 0) {
7346 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7347 return NULL;
7348 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007349 if (size < 0) {
7350 PyErr_BadInternalCall();
7351 return NULL;
7352 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007353
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007354 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007356
Victor Stinner76a31a62011-11-04 00:05:13 +01007357 do
7358 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007360 if (size > INT_MAX) {
7361 chunk_size = INT_MAX;
7362 final = 0;
7363 done = 0;
7364 }
7365 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007366#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007367 {
7368 chunk_size = (int)size;
7369 final = (consumed == NULL);
7370 done = 1;
7371 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007372
Victor Stinner76a31a62011-11-04 00:05:13 +01007373 if (chunk_size == 0 && done) {
7374 if (v != NULL)
7375 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007376 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007377 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007378
Victor Stinner76a31a62011-11-04 00:05:13 +01007379 converted = decode_code_page_strict(code_page, &v,
7380 s, chunk_size);
7381 if (converted == -2)
7382 converted = decode_code_page_errors(code_page, &v,
7383 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007384 errors, final);
7385 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007386
7387 if (converted < 0) {
7388 Py_XDECREF(v);
7389 return NULL;
7390 }
7391
7392 if (consumed)
7393 *consumed += converted;
7394
7395 s += converted;
7396 size -= converted;
7397 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007398
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007399 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007400}
7401
Alexander Belopolsky40018472011-02-26 01:02:56 +00007402PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007403PyUnicode_DecodeCodePageStateful(int code_page,
7404 const char *s,
7405 Py_ssize_t size,
7406 const char *errors,
7407 Py_ssize_t *consumed)
7408{
7409 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7410}
7411
7412PyObject *
7413PyUnicode_DecodeMBCSStateful(const char *s,
7414 Py_ssize_t size,
7415 const char *errors,
7416 Py_ssize_t *consumed)
7417{
7418 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7419}
7420
7421PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007422PyUnicode_DecodeMBCS(const char *s,
7423 Py_ssize_t size,
7424 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007425{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007426 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7427}
7428
Victor Stinner3a50e702011-10-18 21:21:00 +02007429static DWORD
7430encode_code_page_flags(UINT code_page, const char *errors)
7431{
7432 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007433 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 }
7435 else if (code_page == CP_UTF7) {
7436 /* CP_UTF7 only supports flags=0 */
7437 return 0;
7438 }
7439 else {
7440 if (errors != NULL && strcmp(errors, "replace") == 0)
7441 return 0;
7442 else
7443 return WC_NO_BEST_FIT_CHARS;
7444 }
7445}
7446
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007447/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 * Encode a Unicode string to a Windows code page into a byte string in strict
7449 * mode.
7450 *
7451 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007452 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007453 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007454static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007455encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007456 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007458{
Victor Stinner554f3f02010-06-16 23:33:54 +00007459 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 BOOL *pusedDefaultChar = &usedDefaultChar;
7461 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007462 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007463 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 const DWORD flags = encode_code_page_flags(code_page, NULL);
7465 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007466 /* Create a substring so that we can get the UTF-16 representation
7467 of just the slice under consideration. */
7468 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007469
Martin v. Löwis3d325192011-11-04 18:23:06 +01007470 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007471
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007473 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007475 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007476
Victor Stinner2fc507f2011-11-04 20:06:39 +01007477 substring = PyUnicode_Substring(unicode, offset, offset+len);
7478 if (substring == NULL)
7479 return -1;
7480 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7481 if (p == NULL) {
7482 Py_DECREF(substring);
7483 return -1;
7484 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007485 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007486
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007487 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007489 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 NULL, 0,
7491 NULL, pusedDefaultChar);
7492 if (outsize <= 0)
7493 goto error;
7494 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007495 if (pusedDefaultChar && *pusedDefaultChar) {
7496 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007498 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007499
Victor Stinner3a50e702011-10-18 21:21:00 +02007500 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007503 if (*outbytes == NULL) {
7504 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007505 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007506 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007507 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007508 }
7509 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 const Py_ssize_t n = PyBytes_Size(*outbytes);
7512 if (outsize > PY_SSIZE_T_MAX - n) {
7513 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007514 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007517 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7518 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007519 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007520 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007521 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007522 }
7523
7524 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007526 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 out, outsize,
7528 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007529 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 if (outsize <= 0)
7531 goto error;
7532 if (pusedDefaultChar && *pusedDefaultChar)
7533 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007534 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007535
Victor Stinner3a50e702011-10-18 21:21:00 +02007536error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007537 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007538 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7539 return -2;
7540 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007541 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007542}
7543
Victor Stinner3a50e702011-10-18 21:21:00 +02007544/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007545 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007546 * error handler.
7547 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007548 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 * -1 on other error.
7550 */
7551static int
7552encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007553 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007554 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007555{
Victor Stinner3a50e702011-10-18 21:21:00 +02007556 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007557 Py_ssize_t pos = unicode_offset;
7558 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007559 /* Ideally, we should get reason from FormatMessage. This is the Windows
7560 2000 English version of the message. */
7561 const char *reason = "invalid character";
7562 /* 4=maximum length of a UTF-8 sequence */
7563 char buffer[4];
7564 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7565 Py_ssize_t outsize;
7566 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007567 PyObject *errorHandler = NULL;
7568 PyObject *exc = NULL;
7569 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007570 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007571 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007572 PyObject *rep;
7573 int ret = -1;
7574
7575 assert(insize > 0);
7576
7577 encoding = code_page_name(code_page, &encoding_obj);
7578 if (encoding == NULL)
7579 return -1;
7580
7581 if (errors == NULL || strcmp(errors, "strict") == 0) {
7582 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7583 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007584 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007585 if (exc != NULL) {
7586 PyCodec_StrictErrors(exc);
7587 Py_DECREF(exc);
7588 }
7589 Py_XDECREF(encoding_obj);
7590 return -1;
7591 }
7592
7593 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7594 pusedDefaultChar = &usedDefaultChar;
7595 else
7596 pusedDefaultChar = NULL;
7597
7598 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7599 PyErr_NoMemory();
7600 goto error;
7601 }
7602 outsize = insize * Py_ARRAY_LENGTH(buffer);
7603
7604 if (*outbytes == NULL) {
7605 /* Create string object */
7606 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7607 if (*outbytes == NULL)
7608 goto error;
7609 out = PyBytes_AS_STRING(*outbytes);
7610 }
7611 else {
7612 /* Extend string object */
7613 Py_ssize_t n = PyBytes_Size(*outbytes);
7614 if (n > PY_SSIZE_T_MAX - outsize) {
7615 PyErr_NoMemory();
7616 goto error;
7617 }
7618 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7619 goto error;
7620 out = PyBytes_AS_STRING(*outbytes) + n;
7621 }
7622
7623 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007624 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007625 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007626 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7627 wchar_t chars[2];
7628 int charsize;
7629 if (ch < 0x10000) {
7630 chars[0] = (wchar_t)ch;
7631 charsize = 1;
7632 }
7633 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007634 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7635 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007636 charsize = 2;
7637 }
7638
Victor Stinner3a50e702011-10-18 21:21:00 +02007639 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007640 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007641 buffer, Py_ARRAY_LENGTH(buffer),
7642 NULL, pusedDefaultChar);
7643 if (outsize > 0) {
7644 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7645 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007646 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 memcpy(out, buffer, outsize);
7648 out += outsize;
7649 continue;
7650 }
7651 }
7652 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7653 PyErr_SetFromWindowsErr(0);
7654 goto error;
7655 }
7656
Victor Stinner3a50e702011-10-18 21:21:00 +02007657 rep = unicode_encode_call_errorhandler(
7658 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007659 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007660 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007661 if (rep == NULL)
7662 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007663 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007664
7665 if (PyBytes_Check(rep)) {
7666 outsize = PyBytes_GET_SIZE(rep);
7667 if (outsize != 1) {
7668 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7669 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7670 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7671 Py_DECREF(rep);
7672 goto error;
7673 }
7674 out = PyBytes_AS_STRING(*outbytes) + offset;
7675 }
7676 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7677 out += outsize;
7678 }
7679 else {
7680 Py_ssize_t i;
7681 enum PyUnicode_Kind kind;
7682 void *data;
7683
Benjamin Petersonbac79492012-01-14 13:34:47 -05007684 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007685 Py_DECREF(rep);
7686 goto error;
7687 }
7688
7689 outsize = PyUnicode_GET_LENGTH(rep);
7690 if (outsize != 1) {
7691 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7692 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7693 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7694 Py_DECREF(rep);
7695 goto error;
7696 }
7697 out = PyBytes_AS_STRING(*outbytes) + offset;
7698 }
7699 kind = PyUnicode_KIND(rep);
7700 data = PyUnicode_DATA(rep);
7701 for (i=0; i < outsize; i++) {
7702 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7703 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007704 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007705 encoding, unicode,
7706 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007707 "unable to encode error handler result to ASCII");
7708 Py_DECREF(rep);
7709 goto error;
7710 }
7711 *out = (unsigned char)ch;
7712 out++;
7713 }
7714 }
7715 Py_DECREF(rep);
7716 }
7717 /* write a NUL byte */
7718 *out = 0;
7719 outsize = out - PyBytes_AS_STRING(*outbytes);
7720 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7721 if (_PyBytes_Resize(outbytes, outsize) < 0)
7722 goto error;
7723 ret = 0;
7724
7725error:
7726 Py_XDECREF(encoding_obj);
7727 Py_XDECREF(errorHandler);
7728 Py_XDECREF(exc);
7729 return ret;
7730}
7731
Victor Stinner3a50e702011-10-18 21:21:00 +02007732static PyObject *
7733encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007734 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007735 const char *errors)
7736{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007737 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007738 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007739 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007740 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007741
Victor Stinner29dacf22015-01-26 16:41:32 +01007742 if (!PyUnicode_Check(unicode)) {
7743 PyErr_BadArgument();
7744 return NULL;
7745 }
7746
Benjamin Petersonbac79492012-01-14 13:34:47 -05007747 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007748 return NULL;
7749 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007750
Victor Stinner3a50e702011-10-18 21:21:00 +02007751 if (code_page < 0) {
7752 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7753 return NULL;
7754 }
7755
Martin v. Löwis3d325192011-11-04 18:23:06 +01007756 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007757 return PyBytes_FromStringAndSize(NULL, 0);
7758
Victor Stinner7581cef2011-11-03 22:32:33 +01007759 offset = 0;
7760 do
7761 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007762#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007763 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007764 chunks. */
7765 if (len > INT_MAX/2) {
7766 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007767 done = 0;
7768 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007769 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007770#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007771 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007772 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007773 done = 1;
7774 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007775
Victor Stinner76a31a62011-11-04 00:05:13 +01007776 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007777 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007778 errors);
7779 if (ret == -2)
7780 ret = encode_code_page_errors(code_page, &outbytes,
7781 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007782 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007783 if (ret < 0) {
7784 Py_XDECREF(outbytes);
7785 return NULL;
7786 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007787
Victor Stinner7581cef2011-11-03 22:32:33 +01007788 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007789 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007790 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007791
Victor Stinner3a50e702011-10-18 21:21:00 +02007792 return outbytes;
7793}
7794
7795PyObject *
7796PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7797 Py_ssize_t size,
7798 const char *errors)
7799{
Victor Stinner7581cef2011-11-03 22:32:33 +01007800 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007801 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007802 if (unicode == NULL)
7803 return NULL;
7804 res = encode_code_page(CP_ACP, unicode, errors);
7805 Py_DECREF(unicode);
7806 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007807}
7808
7809PyObject *
7810PyUnicode_EncodeCodePage(int code_page,
7811 PyObject *unicode,
7812 const char *errors)
7813{
Victor Stinner7581cef2011-11-03 22:32:33 +01007814 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007815}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007816
Alexander Belopolsky40018472011-02-26 01:02:56 +00007817PyObject *
7818PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007819{
Victor Stinner7581cef2011-11-03 22:32:33 +01007820 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007821}
7822
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007823#undef NEED_RETRY
7824
Steve Dowercc16be82016-09-08 10:35:16 -07007825#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007826
/* --- Character Mapping Codec -------------------------------------------- */
7828
Victor Stinnerfb161b12013-04-18 01:44:27 +02007829static int
7830charmap_decode_string(const char *s,
7831 Py_ssize_t size,
7832 PyObject *mapping,
7833 const char *errors,
7834 _PyUnicodeWriter *writer)
7835{
7836 const char *starts = s;
7837 const char *e;
7838 Py_ssize_t startinpos, endinpos;
7839 PyObject *errorHandler = NULL, *exc = NULL;
7840 Py_ssize_t maplen;
7841 enum PyUnicode_Kind mapkind;
7842 void *mapdata;
7843 Py_UCS4 x;
7844 unsigned char ch;
7845
7846 if (PyUnicode_READY(mapping) == -1)
7847 return -1;
7848
7849 maplen = PyUnicode_GET_LENGTH(mapping);
7850 mapdata = PyUnicode_DATA(mapping);
7851 mapkind = PyUnicode_KIND(mapping);
7852
7853 e = s + size;
7854
7855 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7856 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7857 * is disabled in encoding aliases, latin1 is preferred because
7858 * its implementation is faster. */
7859 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7860 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7861 Py_UCS4 maxchar = writer->maxchar;
7862
7863 assert (writer->kind == PyUnicode_1BYTE_KIND);
7864 while (s < e) {
7865 ch = *s;
7866 x = mapdata_ucs1[ch];
7867 if (x > maxchar) {
7868 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7869 goto onError;
7870 maxchar = writer->maxchar;
7871 outdata = (Py_UCS1 *)writer->data;
7872 }
7873 outdata[writer->pos] = x;
7874 writer->pos++;
7875 ++s;
7876 }
7877 return 0;
7878 }
7879
7880 while (s < e) {
7881 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7882 enum PyUnicode_Kind outkind = writer->kind;
7883 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7884 if (outkind == PyUnicode_1BYTE_KIND) {
7885 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7886 Py_UCS4 maxchar = writer->maxchar;
7887 while (s < e) {
7888 ch = *s;
7889 x = mapdata_ucs2[ch];
7890 if (x > maxchar)
7891 goto Error;
7892 outdata[writer->pos] = x;
7893 writer->pos++;
7894 ++s;
7895 }
7896 break;
7897 }
7898 else if (outkind == PyUnicode_2BYTE_KIND) {
7899 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7900 while (s < e) {
7901 ch = *s;
7902 x = mapdata_ucs2[ch];
7903 if (x == 0xFFFE)
7904 goto Error;
7905 outdata[writer->pos] = x;
7906 writer->pos++;
7907 ++s;
7908 }
7909 break;
7910 }
7911 }
7912 ch = *s;
7913
7914 if (ch < maplen)
7915 x = PyUnicode_READ(mapkind, mapdata, ch);
7916 else
7917 x = 0xfffe; /* invalid value */
7918Error:
7919 if (x == 0xfffe)
7920 {
7921 /* undefined mapping */
7922 startinpos = s-starts;
7923 endinpos = startinpos+1;
7924 if (unicode_decode_call_errorhandler_writer(
7925 errors, &errorHandler,
7926 "charmap", "character maps to <undefined>",
7927 &starts, &e, &startinpos, &endinpos, &exc, &s,
7928 writer)) {
7929 goto onError;
7930 }
7931 continue;
7932 }
7933
7934 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7935 goto onError;
7936 ++s;
7937 }
7938 Py_XDECREF(errorHandler);
7939 Py_XDECREF(exc);
7940 return 0;
7941
7942onError:
7943 Py_XDECREF(errorHandler);
7944 Py_XDECREF(exc);
7945 return -1;
7946}
7947
7948static int
7949charmap_decode_mapping(const char *s,
7950 Py_ssize_t size,
7951 PyObject *mapping,
7952 const char *errors,
7953 _PyUnicodeWriter *writer)
7954{
7955 const char *starts = s;
7956 const char *e;
7957 Py_ssize_t startinpos, endinpos;
7958 PyObject *errorHandler = NULL, *exc = NULL;
7959 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007960 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007961
7962 e = s + size;
7963
7964 while (s < e) {
7965 ch = *s;
7966
7967 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7968 key = PyLong_FromLong((long)ch);
7969 if (key == NULL)
7970 goto onError;
7971
7972 item = PyObject_GetItem(mapping, key);
7973 Py_DECREF(key);
7974 if (item == NULL) {
7975 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7976 /* No mapping found means: mapping is undefined. */
7977 PyErr_Clear();
7978 goto Undefined;
7979 } else
7980 goto onError;
7981 }
7982
7983 /* Apply mapping */
7984 if (item == Py_None)
7985 goto Undefined;
7986 if (PyLong_Check(item)) {
7987 long value = PyLong_AS_LONG(item);
7988 if (value == 0xFFFE)
7989 goto Undefined;
7990 if (value < 0 || value > MAX_UNICODE) {
7991 PyErr_Format(PyExc_TypeError,
7992 "character mapping must be in range(0x%lx)",
7993 (unsigned long)MAX_UNICODE + 1);
7994 goto onError;
7995 }
7996
7997 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7998 goto onError;
7999 }
8000 else if (PyUnicode_Check(item)) {
8001 if (PyUnicode_READY(item) == -1)
8002 goto onError;
8003 if (PyUnicode_GET_LENGTH(item) == 1) {
8004 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8005 if (value == 0xFFFE)
8006 goto Undefined;
8007 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8008 goto onError;
8009 }
8010 else {
8011 writer->overallocate = 1;
8012 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8013 goto onError;
8014 }
8015 }
8016 else {
8017 /* wrong return value */
8018 PyErr_SetString(PyExc_TypeError,
8019 "character mapping must return integer, None or str");
8020 goto onError;
8021 }
8022 Py_CLEAR(item);
8023 ++s;
8024 continue;
8025
8026Undefined:
8027 /* undefined mapping */
8028 Py_CLEAR(item);
8029 startinpos = s-starts;
8030 endinpos = startinpos+1;
8031 if (unicode_decode_call_errorhandler_writer(
8032 errors, &errorHandler,
8033 "charmap", "character maps to <undefined>",
8034 &starts, &e, &startinpos, &endinpos, &exc, &s,
8035 writer)) {
8036 goto onError;
8037 }
8038 }
8039 Py_XDECREF(errorHandler);
8040 Py_XDECREF(exc);
8041 return 0;
8042
8043onError:
8044 Py_XDECREF(item);
8045 Py_XDECREF(errorHandler);
8046 Py_XDECREF(exc);
8047 return -1;
8048}
8049
Alexander Belopolsky40018472011-02-26 01:02:56 +00008050PyObject *
8051PyUnicode_DecodeCharmap(const char *s,
8052 Py_ssize_t size,
8053 PyObject *mapping,
8054 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008056 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008057
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 /* Default to Latin-1 */
8059 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008063 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008064 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008065 writer.min_length = size;
8066 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008068
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008069 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008070 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8071 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008072 }
8073 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008074 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8075 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008077 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008078
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008080 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 return NULL;
8082}
8083
/* Charmap encoding: the lookup table */
8085
Alexander Belopolsky40018472011-02-26 01:02:56 +00008086struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 PyObject_HEAD
8088 unsigned char level1[32];
8089 int count2, count3;
8090 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008091};
8092
8093static PyObject*
8094encoding_map_size(PyObject *obj, PyObject* args)
8095{
8096 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008097 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008099}
8100
8101static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008102 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 PyDoc_STR("Return the size (in bytes) of this object") },
8104 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008105};
8106
8107static void
8108encoding_map_dealloc(PyObject* o)
8109{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008110 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008111}
8112
8113static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008114 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 "EncodingMap", /*tp_name*/
8116 sizeof(struct encoding_map), /*tp_basicsize*/
8117 0, /*tp_itemsize*/
8118 /* methods */
8119 encoding_map_dealloc, /*tp_dealloc*/
8120 0, /*tp_print*/
8121 0, /*tp_getattr*/
8122 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008123 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 0, /*tp_repr*/
8125 0, /*tp_as_number*/
8126 0, /*tp_as_sequence*/
8127 0, /*tp_as_mapping*/
8128 0, /*tp_hash*/
8129 0, /*tp_call*/
8130 0, /*tp_str*/
8131 0, /*tp_getattro*/
8132 0, /*tp_setattro*/
8133 0, /*tp_as_buffer*/
8134 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8135 0, /*tp_doc*/
8136 0, /*tp_traverse*/
8137 0, /*tp_clear*/
8138 0, /*tp_richcompare*/
8139 0, /*tp_weaklistoffset*/
8140 0, /*tp_iter*/
8141 0, /*tp_iternext*/
8142 encoding_map_methods, /*tp_methods*/
8143 0, /*tp_members*/
8144 0, /*tp_getset*/
8145 0, /*tp_base*/
8146 0, /*tp_dict*/
8147 0, /*tp_descr_get*/
8148 0, /*tp_descr_set*/
8149 0, /*tp_dictoffset*/
8150 0, /*tp_init*/
8151 0, /*tp_alloc*/
8152 0, /*tp_new*/
8153 0, /*tp_free*/
8154 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008155};
8156
8157PyObject*
8158PyUnicode_BuildEncodingMap(PyObject* string)
8159{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008160 PyObject *result;
8161 struct encoding_map *mresult;
8162 int i;
8163 int need_dict = 0;
8164 unsigned char level1[32];
8165 unsigned char level2[512];
8166 unsigned char *mlevel1, *mlevel2, *mlevel3;
8167 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008168 int kind;
8169 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008170 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008171 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008173 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008174 PyErr_BadArgument();
8175 return NULL;
8176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008177 kind = PyUnicode_KIND(string);
8178 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008179 length = PyUnicode_GET_LENGTH(string);
8180 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008181 memset(level1, 0xFF, sizeof level1);
8182 memset(level2, 0xFF, sizeof level2);
8183
8184 /* If there isn't a one-to-one mapping of NULL to \0,
8185 or if there are non-BMP characters, we need to use
8186 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008187 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008188 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008189 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008190 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008191 ch = PyUnicode_READ(kind, data, i);
8192 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008193 need_dict = 1;
8194 break;
8195 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008196 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008197 /* unmapped character */
8198 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008199 l1 = ch >> 11;
8200 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008201 if (level1[l1] == 0xFF)
8202 level1[l1] = count2++;
8203 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008204 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008205 }
8206
8207 if (count2 >= 0xFF || count3 >= 0xFF)
8208 need_dict = 1;
8209
8210 if (need_dict) {
8211 PyObject *result = PyDict_New();
8212 PyObject *key, *value;
8213 if (!result)
8214 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008215 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008216 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008217 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008218 if (!key || !value)
8219 goto failed1;
8220 if (PyDict_SetItem(result, key, value) == -1)
8221 goto failed1;
8222 Py_DECREF(key);
8223 Py_DECREF(value);
8224 }
8225 return result;
8226 failed1:
8227 Py_XDECREF(key);
8228 Py_XDECREF(value);
8229 Py_DECREF(result);
8230 return NULL;
8231 }
8232
8233 /* Create a three-level trie */
8234 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8235 16*count2 + 128*count3 - 1);
8236 if (!result)
8237 return PyErr_NoMemory();
8238 PyObject_Init(result, &EncodingMapType);
8239 mresult = (struct encoding_map*)result;
8240 mresult->count2 = count2;
8241 mresult->count3 = count3;
8242 mlevel1 = mresult->level1;
8243 mlevel2 = mresult->level23;
8244 mlevel3 = mresult->level23 + 16*count2;
8245 memcpy(mlevel1, level1, 32);
8246 memset(mlevel2, 0xFF, 16*count2);
8247 memset(mlevel3, 0, 128*count3);
8248 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008249 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008250 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008251 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8252 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008253 /* unmapped character */
8254 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008255 o1 = ch>>11;
8256 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008257 i2 = 16*mlevel1[o1] + o2;
8258 if (mlevel2[i2] == 0xFF)
8259 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008260 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008261 i3 = 128*mlevel2[i2] + o3;
8262 mlevel3[i3] = i;
8263 }
8264 return result;
8265}
8266
8267static int
Victor Stinner22168992011-11-20 17:09:18 +01008268encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008269{
8270 struct encoding_map *map = (struct encoding_map*)mapping;
8271 int l1 = c>>11;
8272 int l2 = (c>>7) & 0xF;
8273 int l3 = c & 0x7F;
8274 int i;
8275
Victor Stinner22168992011-11-20 17:09:18 +01008276 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008278 if (c == 0)
8279 return 0;
8280 /* level 1*/
8281 i = map->level1[l1];
8282 if (i == 0xFF) {
8283 return -1;
8284 }
8285 /* level 2*/
8286 i = map->level23[16*i+l2];
8287 if (i == 0xFF) {
8288 return -1;
8289 }
8290 /* level 3 */
8291 i = map->level23[16*map->count2 + 128*i + l3];
8292 if (i == 0) {
8293 return -1;
8294 }
8295 return i;
8296}
8297
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298/* Lookup the character ch in the mapping. If the character
8299 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008300 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008301static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008302charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303{
Christian Heimes217cfd12007-12-02 14:31:20 +00008304 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 PyObject *x;
8306
8307 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309 x = PyObject_GetItem(mapping, w);
8310 Py_DECREF(w);
8311 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8313 /* No mapping found means: mapping is undefined. */
8314 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008315 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 } else
8317 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008319 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008321 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 long value = PyLong_AS_LONG(x);
8323 if (value < 0 || value > 255) {
8324 PyErr_SetString(PyExc_TypeError,
8325 "character mapping must be in range(256)");
8326 Py_DECREF(x);
8327 return NULL;
8328 }
8329 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008331 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 /* wrong return value */
8335 PyErr_Format(PyExc_TypeError,
8336 "character mapping must return integer, bytes or None, not %.400s",
8337 x->ob_type->tp_name);
8338 Py_DECREF(x);
8339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 }
8341}
8342
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008343static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008344charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008345{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008346 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8347 /* exponentially overallocate to minimize reallocations */
8348 if (requiredsize < 2*outsize)
8349 requiredsize = 2*outsize;
8350 if (_PyBytes_Resize(outobj, requiredsize))
8351 return -1;
8352 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008353}
8354
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008358/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008359 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360 space is available. Return a new reference to the object that
8361 was put in the output buffer, or Py_None, if the mapping was undefined
8362 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008363 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008364static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008365charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008366 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008368 PyObject *rep;
8369 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008370 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371
Christian Heimes90aa7642007-12-19 02:45:37 +00008372 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008373 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008375 if (res == -1)
8376 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 if (outsize<requiredsize)
8378 if (charmapencode_resize(outobj, outpos, requiredsize))
8379 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008380 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 outstart[(*outpos)++] = (char)res;
8382 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008383 }
8384
8385 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008388 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 Py_DECREF(rep);
8390 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008391 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 if (PyLong_Check(rep)) {
8393 Py_ssize_t requiredsize = *outpos+1;
8394 if (outsize<requiredsize)
8395 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8396 Py_DECREF(rep);
8397 return enc_EXCEPTION;
8398 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008399 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008401 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 else {
8403 const char *repchars = PyBytes_AS_STRING(rep);
8404 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8405 Py_ssize_t requiredsize = *outpos+repsize;
8406 if (outsize<requiredsize)
8407 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8408 Py_DECREF(rep);
8409 return enc_EXCEPTION;
8410 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008411 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 memcpy(outstart + *outpos, repchars, repsize);
8413 *outpos += repsize;
8414 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008416 Py_DECREF(rep);
8417 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418}
8419
8420/* handle an error in PyUnicode_EncodeCharmap
8421 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008422static int
8423charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008424 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008426 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008427 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428{
8429 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008430 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008431 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008432 enum PyUnicode_Kind kind;
8433 void *data;
8434 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008436 Py_ssize_t collstartpos = *inpos;
8437 Py_ssize_t collendpos = *inpos+1;
8438 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008439 const char *encoding = "charmap";
8440 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008441 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008442 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008443 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444
Benjamin Petersonbac79492012-01-14 13:34:47 -05008445 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008446 return -1;
8447 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448 /* find all unencodable characters */
8449 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008450 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008451 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008452 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008453 val = encoding_map_lookup(ch, mapping);
8454 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 break;
8456 ++collendpos;
8457 continue;
8458 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008459
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008460 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8461 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 if (rep==NULL)
8463 return -1;
8464 else if (rep!=Py_None) {
8465 Py_DECREF(rep);
8466 break;
8467 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008468 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008470 }
8471 /* cache callback name lookup
8472 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008473 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008474 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008475
8476 switch (*error_handler) {
8477 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008478 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008479 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008480
8481 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008482 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 x = charmapencode_output('?', mapping, res, respos);
8484 if (x==enc_EXCEPTION) {
8485 return -1;
8486 }
8487 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008488 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 return -1;
8490 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008491 }
8492 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008493 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008494 *inpos = collendpos;
8495 break;
Victor Stinner50149202015-09-22 00:26:54 +02008496
8497 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008498 /* generate replacement (temporarily (mis)uses p) */
8499 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 char buffer[2+29+1+1];
8501 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008502 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 for (cp = buffer; *cp; ++cp) {
8504 x = charmapencode_output(*cp, mapping, res, respos);
8505 if (x==enc_EXCEPTION)
8506 return -1;
8507 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008508 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 return -1;
8510 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008511 }
8512 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008513 *inpos = collendpos;
8514 break;
Victor Stinner50149202015-09-22 00:26:54 +02008515
Benjamin Peterson14339b62009-01-31 16:36:08 +00008516 default:
Victor Stinner50149202015-09-22 00:26:54 +02008517 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008518 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008520 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008522 if (PyBytes_Check(repunicode)) {
8523 /* Directly copy bytes result to output. */
8524 Py_ssize_t outsize = PyBytes_Size(*res);
8525 Py_ssize_t requiredsize;
8526 repsize = PyBytes_Size(repunicode);
8527 requiredsize = *respos + repsize;
8528 if (requiredsize > outsize)
8529 /* Make room for all additional bytes. */
8530 if (charmapencode_resize(res, respos, requiredsize)) {
8531 Py_DECREF(repunicode);
8532 return -1;
8533 }
8534 memcpy(PyBytes_AsString(*res) + *respos,
8535 PyBytes_AsString(repunicode), repsize);
8536 *respos += repsize;
8537 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008538 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008539 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008540 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008541 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008542 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008543 Py_DECREF(repunicode);
8544 return -1;
8545 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008546 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008547 data = PyUnicode_DATA(repunicode);
8548 kind = PyUnicode_KIND(repunicode);
8549 for (index = 0; index < repsize; index++) {
8550 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8551 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008553 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 return -1;
8555 }
8556 else if (x==enc_FAILED) {
8557 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008558 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 return -1;
8560 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008561 }
8562 *inpos = newpos;
8563 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 }
8565 return 0;
8566}
8567
Alexander Belopolsky40018472011-02-26 01:02:56 +00008568PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008569_PyUnicode_EncodeCharmap(PyObject *unicode,
8570 PyObject *mapping,
8571 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573 /* output object */
8574 PyObject *res = NULL;
8575 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008576 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008577 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008579 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008580 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008581 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008582 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008583 void *data;
8584 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585
Benjamin Petersonbac79492012-01-14 13:34:47 -05008586 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008587 return NULL;
8588 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008589 data = PyUnicode_DATA(unicode);
8590 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008591
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 /* Default to Latin-1 */
8593 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008594 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008596 /* allocate enough for a simple encoding without
8597 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008598 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 if (res == NULL)
8600 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008601 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008604 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008605 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008607 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 if (x==enc_EXCEPTION) /* error */
8609 goto onError;
8610 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008611 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008613 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 &res, &respos)) {
8615 goto onError;
8616 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008617 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 else
8619 /* done with this character => adjust input position */
8620 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008624 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008625 if (_PyBytes_Resize(&res, respos) < 0)
8626 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008629 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630 return res;
8631
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 Py_XDECREF(res);
8634 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008635 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636 return NULL;
8637}
8638
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008639/* Deprecated */
8640PyObject *
8641PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8642 Py_ssize_t size,
8643 PyObject *mapping,
8644 const char *errors)
8645{
8646 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008647 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008648 if (unicode == NULL)
8649 return NULL;
8650 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8651 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008652 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008653}
8654
Alexander Belopolsky40018472011-02-26 01:02:56 +00008655PyObject *
8656PyUnicode_AsCharmapString(PyObject *unicode,
8657 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658{
8659 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 PyErr_BadArgument();
8661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008663 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664}
8665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008667static void
8668make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008670 Py_ssize_t startpos, Py_ssize_t endpos,
8671 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 *exceptionObject = _PyUnicodeTranslateError_Create(
8675 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676 }
8677 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8679 goto onError;
8680 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8681 goto onError;
8682 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8683 goto onError;
8684 return;
8685 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008686 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687 }
8688}
8689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690/* error handling callback helper:
8691 build arguments, call the callback and check the arguments,
8692 put the result into newpos and return the replacement string, which
8693 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008694static PyObject *
8695unicode_translate_call_errorhandler(const char *errors,
8696 PyObject **errorHandler,
8697 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008699 Py_ssize_t startpos, Py_ssize_t endpos,
8700 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008702 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008703
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008704 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705 PyObject *restuple;
8706 PyObject *resunicode;
8707
8708 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712 }
8713
8714 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008719 restuple = PyObject_CallFunctionObjArgs(
8720 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008721 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008723 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008724 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 Py_DECREF(restuple);
8726 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008728 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 &resunicode, &i_newpos)) {
8730 Py_DECREF(restuple);
8731 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008733 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008735 else
8736 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008738 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 Py_DECREF(restuple);
8740 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008742 Py_INCREF(resunicode);
8743 Py_DECREF(restuple);
8744 return resunicode;
8745}
8746
8747/* Lookup the character ch in the mapping and put the result in result,
8748 which must be decrefed by the caller.
8749 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008750static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008752{
Christian Heimes217cfd12007-12-02 14:31:20 +00008753 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008754 PyObject *x;
8755
8756 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008757 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008758 x = PyObject_GetItem(mapping, w);
8759 Py_DECREF(w);
8760 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8762 /* No mapping found means: use 1:1 mapping. */
8763 PyErr_Clear();
8764 *result = NULL;
8765 return 0;
8766 } else
8767 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768 }
8769 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 *result = x;
8771 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008772 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008773 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008775 if (value < 0 || value > MAX_UNICODE) {
8776 PyErr_Format(PyExc_ValueError,
8777 "character mapping must be in range(0x%x)",
8778 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 Py_DECREF(x);
8780 return -1;
8781 }
8782 *result = x;
8783 return 0;
8784 }
8785 else if (PyUnicode_Check(x)) {
8786 *result = x;
8787 return 0;
8788 }
8789 else {
8790 /* wrong return value */
8791 PyErr_SetString(PyExc_TypeError,
8792 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008793 Py_DECREF(x);
8794 return -1;
8795 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008796}
Victor Stinner1194ea02014-04-04 19:37:40 +02008797
8798/* lookup the character, write the result into the writer.
8799 Return 1 if the result was written into the writer, return 0 if the mapping
8800 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008801static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008802charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8803 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008804{
Victor Stinner1194ea02014-04-04 19:37:40 +02008805 PyObject *item;
8806
8807 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008809
8810 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008812 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008814 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008815 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008817
8818 if (item == Py_None) {
8819 Py_DECREF(item);
8820 return 0;
8821 }
8822
8823 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008824 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8825 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8826 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008827 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8828 Py_DECREF(item);
8829 return -1;
8830 }
8831 Py_DECREF(item);
8832 return 1;
8833 }
8834
8835 if (!PyUnicode_Check(item)) {
8836 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008838 }
8839
8840 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8841 Py_DECREF(item);
8842 return -1;
8843 }
8844
8845 Py_DECREF(item);
8846 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008847}
8848
Victor Stinner89a76ab2014-04-05 11:44:04 +02008849static int
8850unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8851 Py_UCS1 *translate)
8852{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008853 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008854 int ret = 0;
8855
Victor Stinner89a76ab2014-04-05 11:44:04 +02008856 if (charmaptranslate_lookup(ch, mapping, &item)) {
8857 return -1;
8858 }
8859
8860 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008861 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008862 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008864 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008865 /* not found => default to 1:1 mapping */
8866 translate[ch] = ch;
8867 return 1;
8868 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008869 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008870 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008871 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8872 used it */
8873 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008874 /* invalid character or character outside ASCII:
8875 skip the fast translate */
8876 goto exit;
8877 }
8878 translate[ch] = (Py_UCS1)replace;
8879 }
8880 else if (PyUnicode_Check(item)) {
8881 Py_UCS4 replace;
8882
8883 if (PyUnicode_READY(item) == -1) {
8884 Py_DECREF(item);
8885 return -1;
8886 }
8887 if (PyUnicode_GET_LENGTH(item) != 1)
8888 goto exit;
8889
8890 replace = PyUnicode_READ_CHAR(item, 0);
8891 if (replace > 127)
8892 goto exit;
8893 translate[ch] = (Py_UCS1)replace;
8894 }
8895 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008896 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008897 goto exit;
8898 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008899 ret = 1;
8900
Benjamin Peterson1365de72014-04-07 20:15:41 -04008901 exit:
8902 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008903 return ret;
8904}
8905
8906/* Fast path for ascii => ascii translation. Return 1 if the whole string
8907 was translated into writer, return 0 if the input string was partially
8908 translated into writer, raise an exception and return -1 on error. */
8909static int
8910unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008911 _PyUnicodeWriter *writer, int ignore,
8912 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008913{
Victor Stinner872b2912014-04-05 14:27:07 +02008914 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008915 Py_ssize_t len;
8916 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008917 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008918
Victor Stinner89a76ab2014-04-05 11:44:04 +02008919 len = PyUnicode_GET_LENGTH(input);
8920
Victor Stinner872b2912014-04-05 14:27:07 +02008921 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008922
8923 in = PyUnicode_1BYTE_DATA(input);
8924 end = in + len;
8925
8926 assert(PyUnicode_IS_ASCII(writer->buffer));
8927 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8928 out = PyUnicode_1BYTE_DATA(writer->buffer);
8929
Victor Stinner872b2912014-04-05 14:27:07 +02008930 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008931 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008932 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008933 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008934 int translate = unicode_fast_translate_lookup(mapping, ch,
8935 ascii_table);
8936 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008937 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008938 if (translate == 0)
8939 goto exit;
8940 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008941 }
Victor Stinner872b2912014-04-05 14:27:07 +02008942 if (ch2 == 0xfe) {
8943 if (ignore)
8944 continue;
8945 goto exit;
8946 }
8947 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008948 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008949 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008950 }
Victor Stinner872b2912014-04-05 14:27:07 +02008951 res = 1;
8952
8953exit:
8954 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008955 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008956 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008957}
8958
Victor Stinner3222da22015-10-01 22:07:32 +02008959static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960_PyUnicode_TranslateCharmap(PyObject *input,
8961 PyObject *mapping,
8962 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008965 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 Py_ssize_t size, i;
8967 int kind;
8968 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008969 _PyUnicodeWriter writer;
8970 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008971 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008972 PyObject *errorHandler = NULL;
8973 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008974 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008975 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008976
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 PyErr_BadArgument();
8979 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 if (PyUnicode_READY(input) == -1)
8983 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008984 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 kind = PyUnicode_KIND(input);
8986 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008988 if (size == 0)
8989 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008991 /* allocate enough for a simple 1:1 translation without
8992 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008993 _PyUnicodeWriter_Init(&writer);
8994 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008995 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996
Victor Stinner872b2912014-04-05 14:27:07 +02008997 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8998
Victor Stinner33798672016-03-01 21:59:58 +01008999 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009000 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009001 if (PyUnicode_IS_ASCII(input)) {
9002 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9003 if (res < 0) {
9004 _PyUnicodeWriter_Dealloc(&writer);
9005 return NULL;
9006 }
9007 if (res == 1)
9008 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009009 }
Victor Stinner33798672016-03-01 21:59:58 +01009010 else {
9011 i = 0;
9012 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009015 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009016 int translate;
9017 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9018 Py_ssize_t newpos;
9019 /* startpos for collecting untranslatable chars */
9020 Py_ssize_t collstart;
9021 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009022 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023
Victor Stinner1194ea02014-04-04 19:37:40 +02009024 ch = PyUnicode_READ(kind, data, i);
9025 translate = charmaptranslate_output(ch, mapping, &writer);
9026 if (translate < 0)
9027 goto onError;
9028
9029 if (translate != 0) {
9030 /* it worked => adjust input pointer */
9031 ++i;
9032 continue;
9033 }
9034
9035 /* untranslatable character */
9036 collstart = i;
9037 collend = i+1;
9038
9039 /* find all untranslatable characters */
9040 while (collend < size) {
9041 PyObject *x;
9042 ch = PyUnicode_READ(kind, data, collend);
9043 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009044 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009045 Py_XDECREF(x);
9046 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009047 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009048 ++collend;
9049 }
9050
9051 if (ignore) {
9052 i = collend;
9053 }
9054 else {
9055 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9056 reason, input, &exc,
9057 collstart, collend, &newpos);
9058 if (repunicode == NULL)
9059 goto onError;
9060 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009061 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009062 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009063 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009064 Py_DECREF(repunicode);
9065 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009066 }
9067 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009068 Py_XDECREF(exc);
9069 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009070 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071
Benjamin Peterson29060642009-01-31 22:14:21 +00009072 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009073 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009074 Py_XDECREF(exc);
9075 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076 return NULL;
9077}
9078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079/* Deprecated. Use PyUnicode_Translate instead. */
9080PyObject *
9081PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9082 Py_ssize_t size,
9083 PyObject *mapping,
9084 const char *errors)
9085{
Christian Heimes5f520f42012-09-11 14:03:25 +02009086 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009087 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 if (!unicode)
9089 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009090 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9091 Py_DECREF(unicode);
9092 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093}
9094
Alexander Belopolsky40018472011-02-26 01:02:56 +00009095PyObject *
9096PyUnicode_Translate(PyObject *str,
9097 PyObject *mapping,
9098 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009100 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009101 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009102 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103}
Tim Petersced69f82003-09-16 20:30:58 +00009104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105PyObject *
9106_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9107{
9108 if (!PyUnicode_Check(unicode)) {
9109 PyErr_BadInternalCall();
9110 return NULL;
9111 }
9112 if (PyUnicode_READY(unicode) == -1)
9113 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009114 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115 /* If the string is already ASCII, just return the same string */
9116 Py_INCREF(unicode);
9117 return unicode;
9118 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009119
9120 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9121 PyObject *result = PyUnicode_New(len, 127);
9122 if (result == NULL) {
9123 return NULL;
9124 }
9125
9126 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9127 int kind = PyUnicode_KIND(unicode);
9128 const void *data = PyUnicode_DATA(unicode);
9129 Py_ssize_t i;
9130 for (i = 0; i < len; ++i) {
9131 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9132 if (ch < 127) {
9133 out[i] = ch;
9134 }
9135 else if (Py_UNICODE_ISSPACE(ch)) {
9136 out[i] = ' ';
9137 }
9138 else {
9139 int decimal = Py_UNICODE_TODECIMAL(ch);
9140 if (decimal < 0) {
9141 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009142 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009143 _PyUnicode_LENGTH(result) = i + 1;
9144 break;
9145 }
9146 out[i] = '0' + decimal;
9147 }
9148 }
9149
INADA Naoki16dfca42018-07-14 12:06:43 +09009150 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009151 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152}
9153
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009154PyObject *
9155PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9156 Py_ssize_t length)
9157{
Victor Stinnerf0124502011-11-21 23:12:56 +01009158 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009159 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009160 Py_UCS4 maxchar;
9161 enum PyUnicode_Kind kind;
9162 void *data;
9163
Victor Stinner99d7ad02012-02-22 13:37:39 +01009164 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009165 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009166 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009167 if (ch > 127) {
9168 int decimal = Py_UNICODE_TODECIMAL(ch);
9169 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009170 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009171 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009172 }
9173 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009174
9175 /* Copy to a new string */
9176 decimal = PyUnicode_New(length, maxchar);
9177 if (decimal == NULL)
9178 return decimal;
9179 kind = PyUnicode_KIND(decimal);
9180 data = PyUnicode_DATA(decimal);
9181 /* Iterate over code points */
9182 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009183 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009184 if (ch > 127) {
9185 int decimal = Py_UNICODE_TODECIMAL(ch);
9186 if (decimal >= 0)
9187 ch = '0' + decimal;
9188 }
9189 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009191 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009192}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009193/* --- Decimal Encoder ---------------------------------------------------- */
9194
Alexander Belopolsky40018472011-02-26 01:02:56 +00009195int
9196PyUnicode_EncodeDecimal(Py_UNICODE *s,
9197 Py_ssize_t length,
9198 char *output,
9199 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009200{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009201 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009202 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009203 enum PyUnicode_Kind kind;
9204 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009205
9206 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 PyErr_BadArgument();
9208 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009209 }
9210
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009211 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009212 if (unicode == NULL)
9213 return -1;
9214
Victor Stinner42bf7752011-11-21 22:52:58 +01009215 kind = PyUnicode_KIND(unicode);
9216 data = PyUnicode_DATA(unicode);
9217
Victor Stinnerb84d7232011-11-22 01:50:07 +01009218 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009219 PyObject *exc;
9220 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009221 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009222 Py_ssize_t startpos;
9223
9224 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009225
Benjamin Peterson29060642009-01-31 22:14:21 +00009226 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009227 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009228 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009229 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009230 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009231 decimal = Py_UNICODE_TODECIMAL(ch);
9232 if (decimal >= 0) {
9233 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009234 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009235 continue;
9236 }
9237 if (0 < ch && ch < 256) {
9238 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009239 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009240 continue;
9241 }
Victor Stinner6345be92011-11-25 20:09:01 +01009242
Victor Stinner42bf7752011-11-21 22:52:58 +01009243 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009244 exc = NULL;
9245 raise_encode_exception(&exc, "decimal", unicode,
9246 startpos, startpos+1,
9247 "invalid decimal Unicode string");
9248 Py_XDECREF(exc);
9249 Py_DECREF(unicode);
9250 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009251 }
9252 /* 0-terminate the output string */
9253 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009254 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009255 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009256}
9257
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258/* --- Helpers ------------------------------------------------------------ */
9259
/* Helper macro to fix up start/end slice values against a sequence of
   length `len`:
     - `end` greater than `len` is clipped to `len`;
     - a negative `end` or `start` is taken relative to the end of the
       sequence (`len` is added) and then clipped to 0 if still negative.
   `start` and `end` must be modifiable lvalues; all three arguments are
   evaluated more than once, so they must be free of side effects.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and can be used safely in an unbraced if/else.  Use it
   as a statement, followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009276any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009278 Py_ssize_t end,
9279 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009281 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282 void *buf1, *buf2;
9283 Py_ssize_t len1, len2, result;
9284
9285 kind1 = PyUnicode_KIND(s1);
9286 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009287 if (kind1 < kind2)
9288 return -1;
9289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 len1 = PyUnicode_GET_LENGTH(s1);
9291 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009292 ADJUST_INDICES(start, end, len1);
9293 if (end - start < len2)
9294 return -1;
9295
9296 buf1 = PyUnicode_DATA(s1);
9297 buf2 = PyUnicode_DATA(s2);
9298 if (len2 == 1) {
9299 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9300 result = findchar((const char *)buf1 + kind1*start,
9301 kind1, end - start, ch, direction);
9302 if (result == -1)
9303 return -1;
9304 else
9305 return start + result;
9306 }
9307
9308 if (kind2 != kind1) {
9309 buf2 = _PyUnicode_AsKind(s2, kind1);
9310 if (!buf2)
9311 return -2;
9312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313
Victor Stinner794d5672011-10-10 03:21:36 +02009314 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009315 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009316 case PyUnicode_1BYTE_KIND:
9317 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9318 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9319 else
9320 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9321 break;
9322 case PyUnicode_2BYTE_KIND:
9323 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9324 break;
9325 case PyUnicode_4BYTE_KIND:
9326 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9327 break;
9328 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009329 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009330 }
9331 }
9332 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009333 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009334 case PyUnicode_1BYTE_KIND:
9335 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9336 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9337 else
9338 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9339 break;
9340 case PyUnicode_2BYTE_KIND:
9341 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9342 break;
9343 case PyUnicode_4BYTE_KIND:
9344 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9345 break;
9346 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009347 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009348 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 }
9350
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009351 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 PyMem_Free(buf2);
9353
9354 return result;
9355}
9356
Victor Stinner59423e32018-11-26 13:40:01 +01009357/* _PyUnicode_InsertThousandsGrouping() helper functions */
9358#include "stringlib/localeutil.h"
9359
9360/**
9361 * InsertThousandsGrouping:
9362 * @writer: Unicode writer.
9363 * @n_buffer: Number of characters in @buffer.
9364 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9365 * @d_pos: Start of digits string.
9366 * @n_digits: The number of digits in the string, in which we want
9367 * to put the grouping chars.
9368 * @min_width: The minimum width of the digits in the output string.
9369 * Output will be zero-padded on the left to fill.
9370 * @grouping: see definition in localeconv().
9371 * @thousands_sep: see definition in localeconv().
9372 *
9373 * There are 2 modes: counting and filling. If @writer is NULL,
9374 * we are in counting mode, else filling mode.
9375 * If counting, the required buffer size is returned.
9376 * If filling, we know the buffer will be large enough, so we don't
9377 * need to pass in the buffer size.
9378 * Inserts thousand grouping characters (as defined by grouping and
9379 * thousands_sep) into @writer.
9380 *
9381 * Return value: -1 on error, number of characters otherwise.
9382 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009384_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009385 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009386 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009387 PyObject *digits,
9388 Py_ssize_t d_pos,
9389 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009390 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009391 const char *grouping,
9392 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009393 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394{
Victor Stinner59423e32018-11-26 13:40:01 +01009395 if (writer) {
9396 assert(digits != NULL);
9397 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009398 }
9399 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009400 assert(digits == NULL);
9401 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009402 }
Victor Stinner59423e32018-11-26 13:40:01 +01009403 assert(0 <= d_pos);
9404 assert(0 <= n_digits);
9405 assert(0 <= min_width);
9406 assert(grouping != NULL);
9407
9408 if (digits != NULL) {
9409 if (PyUnicode_READY(digits) == -1) {
9410 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009411 }
Victor Stinner59423e32018-11-26 13:40:01 +01009412 }
9413 if (PyUnicode_READY(thousands_sep) == -1) {
9414 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009415 }
9416
Victor Stinner59423e32018-11-26 13:40:01 +01009417 Py_ssize_t count = 0;
9418 Py_ssize_t n_zeros;
9419 int loop_broken = 0;
9420 int use_separator = 0; /* First time through, don't append the
9421 separator. They only go between
9422 groups. */
9423 Py_ssize_t buffer_pos;
9424 Py_ssize_t digits_pos;
9425 Py_ssize_t len;
9426 Py_ssize_t n_chars;
9427 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9428 be looked at */
9429 /* A generator that returns all of the grouping widths, until it
9430 returns 0. */
9431 GroupGenerator groupgen;
9432 GroupGenerator_init(&groupgen, grouping);
9433 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9434
9435 /* if digits are not grouped, thousands separator
9436 should be an empty string */
9437 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9438
9439 digits_pos = d_pos + n_digits;
9440 if (writer) {
9441 buffer_pos = writer->pos + n_buffer;
9442 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9443 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444 }
Victor Stinner59423e32018-11-26 13:40:01 +01009445 else {
9446 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009447 }
Victor Stinner59423e32018-11-26 13:40:01 +01009448
9449 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009450 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009451 }
Victor Stinner59423e32018-11-26 13:40:01 +01009452
9453 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9454 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9455 n_zeros = Py_MAX(0, len - remaining);
9456 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9457
9458 /* Use n_zero zero's and n_chars chars */
9459
9460 /* Count only, don't do anything. */
9461 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9462
9463 /* Copy into the writer. */
9464 InsertThousandsGrouping_fill(writer, &buffer_pos,
9465 digits, &digits_pos,
9466 n_chars, n_zeros,
9467 use_separator ? thousands_sep : NULL,
9468 thousands_sep_len, maxchar);
9469
9470 /* Use a separator next time. */
9471 use_separator = 1;
9472
9473 remaining -= n_chars;
9474 min_width -= len;
9475
9476 if (remaining <= 0 && min_width <= 0) {
9477 loop_broken = 1;
9478 break;
9479 }
9480 min_width -= thousands_sep_len;
9481 }
9482 if (!loop_broken) {
9483 /* We left the loop without using a break statement. */
9484
9485 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9486 n_zeros = Py_MAX(0, len - remaining);
9487 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9488
9489 /* Use n_zero zero's and n_chars chars */
9490 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9491
9492 /* Copy into the writer. */
9493 InsertThousandsGrouping_fill(writer, &buffer_pos,
9494 digits, &digits_pos,
9495 n_chars, n_zeros,
9496 use_separator ? thousands_sep : NULL,
9497 thousands_sep_len, maxchar);
9498 }
9499 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500}
9501
9502
Alexander Belopolsky40018472011-02-26 01:02:56 +00009503Py_ssize_t
9504PyUnicode_Count(PyObject *str,
9505 PyObject *substr,
9506 Py_ssize_t start,
9507 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009509 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009510 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 void *buf1 = NULL, *buf2 = NULL;
9512 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009513
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009514 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009515 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009516
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009517 kind1 = PyUnicode_KIND(str);
9518 kind2 = PyUnicode_KIND(substr);
9519 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009520 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009521
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009522 len1 = PyUnicode_GET_LENGTH(str);
9523 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009525 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009526 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009527
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009528 buf1 = PyUnicode_DATA(str);
9529 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009530 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009531 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009532 if (!buf2)
9533 goto onError;
9534 }
9535
9536 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009538 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009539 result = asciilib_count(
9540 ((Py_UCS1*)buf1) + start, end - start,
9541 buf2, len2, PY_SSIZE_T_MAX
9542 );
9543 else
9544 result = ucs1lib_count(
9545 ((Py_UCS1*)buf1) + start, end - start,
9546 buf2, len2, PY_SSIZE_T_MAX
9547 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009548 break;
9549 case PyUnicode_2BYTE_KIND:
9550 result = ucs2lib_count(
9551 ((Py_UCS2*)buf1) + start, end - start,
9552 buf2, len2, PY_SSIZE_T_MAX
9553 );
9554 break;
9555 case PyUnicode_4BYTE_KIND:
9556 result = ucs4lib_count(
9557 ((Py_UCS4*)buf1) + start, end - start,
9558 buf2, len2, PY_SSIZE_T_MAX
9559 );
9560 break;
9561 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009562 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009564
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009565 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 PyMem_Free(buf2);
9567
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009570 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 PyMem_Free(buf2);
9572 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573}
9574
Alexander Belopolsky40018472011-02-26 01:02:56 +00009575Py_ssize_t
9576PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009577 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009578 Py_ssize_t start,
9579 Py_ssize_t end,
9580 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009582 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009583 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009584
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009585 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586}
9587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588Py_ssize_t
9589PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9590 Py_ssize_t start, Py_ssize_t end,
9591 int direction)
9592{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009594 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 if (PyUnicode_READY(str) == -1)
9596 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009597 len = PyUnicode_GET_LENGTH(str);
9598 ADJUST_INDICES(start, end, len);
9599 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009600 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009602 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9603 kind, end-start, ch, direction);
9604 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009606 else
9607 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608}
9609
Alexander Belopolsky40018472011-02-26 01:02:56 +00009610static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009611tailmatch(PyObject *self,
9612 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009613 Py_ssize_t start,
9614 Py_ssize_t end,
9615 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617 int kind_self;
9618 int kind_sub;
9619 void *data_self;
9620 void *data_sub;
9621 Py_ssize_t offset;
9622 Py_ssize_t i;
9623 Py_ssize_t end_sub;
9624
9625 if (PyUnicode_READY(self) == -1 ||
9626 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009627 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9630 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009632 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009634 if (PyUnicode_GET_LENGTH(substring) == 0)
9635 return 1;
9636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 kind_self = PyUnicode_KIND(self);
9638 data_self = PyUnicode_DATA(self);
9639 kind_sub = PyUnicode_KIND(substring);
9640 data_sub = PyUnicode_DATA(substring);
9641 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9642
9643 if (direction > 0)
9644 offset = end;
9645 else
9646 offset = start;
9647
9648 if (PyUnicode_READ(kind_self, data_self, offset) ==
9649 PyUnicode_READ(kind_sub, data_sub, 0) &&
9650 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9651 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9652 /* If both are of the same kind, memcmp is sufficient */
9653 if (kind_self == kind_sub) {
9654 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009655 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 data_sub,
9657 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009658 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009660 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 else {
9662 /* We do not need to compare 0 and len(substring)-1 because
9663 the if statement above ensured already that they are equal
9664 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665 for (i = 1; i < end_sub; ++i) {
9666 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9667 PyUnicode_READ(kind_sub, data_sub, i))
9668 return 0;
9669 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009670 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672 }
9673
9674 return 0;
9675}
9676
Alexander Belopolsky40018472011-02-26 01:02:56 +00009677Py_ssize_t
9678PyUnicode_Tailmatch(PyObject *str,
9679 PyObject *substr,
9680 Py_ssize_t start,
9681 Py_ssize_t end,
9682 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009684 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009685 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009686
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009687 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688}
9689
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690static PyObject *
9691ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9694 char *resdata, *data = PyUnicode_DATA(self);
9695 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009696
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009697 res = PyUnicode_New(len, 127);
9698 if (res == NULL)
9699 return NULL;
9700 resdata = PyUnicode_DATA(res);
9701 if (lower)
9702 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704 _Py_bytes_upper(resdata, data, len);
9705 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706}
9707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 Py_ssize_t j;
9712 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009713 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009715
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9717
9718 where ! is a negation and \p{xxx} is a character with property xxx.
9719 */
9720 for (j = i - 1; j >= 0; j--) {
9721 c = PyUnicode_READ(kind, data, j);
9722 if (!_PyUnicode_IsCaseIgnorable(c))
9723 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009725 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9726 if (final_sigma) {
9727 for (j = i + 1; j < length; j++) {
9728 c = PyUnicode_READ(kind, data, j);
9729 if (!_PyUnicode_IsCaseIgnorable(c))
9730 break;
9731 }
9732 final_sigma = j == length || !_PyUnicode_IsCased(c);
9733 }
9734 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735}
9736
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009737static int
9738lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9739 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009741 /* Obscure special case. */
9742 if (c == 0x3A3) {
9743 mapped[0] = handle_capital_sigma(kind, data, length, i);
9744 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009746 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747}
9748
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009749static Py_ssize_t
9750do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009752 Py_ssize_t i, k = 0;
9753 int n_res, j;
9754 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009755
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009756 c = PyUnicode_READ(kind, data, 0);
9757 n_res = _PyUnicode_ToUpperFull(c, mapped);
9758 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009759 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009760 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009762 for (i = 1; i < length; i++) {
9763 c = PyUnicode_READ(kind, data, i);
9764 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9765 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009766 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009767 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009768 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009769 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009770 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771}
9772
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009773static Py_ssize_t
9774do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9775 Py_ssize_t i, k = 0;
9776
9777 for (i = 0; i < length; i++) {
9778 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9779 int n_res, j;
9780 if (Py_UNICODE_ISUPPER(c)) {
9781 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9782 }
9783 else if (Py_UNICODE_ISLOWER(c)) {
9784 n_res = _PyUnicode_ToUpperFull(c, mapped);
9785 }
9786 else {
9787 n_res = 1;
9788 mapped[0] = c;
9789 }
9790 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009791 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009792 res[k++] = mapped[j];
9793 }
9794 }
9795 return k;
9796}
9797
9798static Py_ssize_t
9799do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9800 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009802 Py_ssize_t i, k = 0;
9803
9804 for (i = 0; i < length; i++) {
9805 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9806 int n_res, j;
9807 if (lower)
9808 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9809 else
9810 n_res = _PyUnicode_ToUpperFull(c, mapped);
9811 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009812 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009813 res[k++] = mapped[j];
9814 }
9815 }
9816 return k;
9817}
9818
9819static Py_ssize_t
9820do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9821{
9822 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9823}
9824
9825static Py_ssize_t
9826do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9827{
9828 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9829}
9830
Benjamin Petersone51757f2012-01-12 21:10:29 -05009831static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009832do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9833{
9834 Py_ssize_t i, k = 0;
9835
9836 for (i = 0; i < length; i++) {
9837 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9838 Py_UCS4 mapped[3];
9839 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9840 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009841 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009842 res[k++] = mapped[j];
9843 }
9844 }
9845 return k;
9846}
9847
9848static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009849do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9850{
9851 Py_ssize_t i, k = 0;
9852 int previous_is_cased;
9853
9854 previous_is_cased = 0;
9855 for (i = 0; i < length; i++) {
9856 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9857 Py_UCS4 mapped[3];
9858 int n_res, j;
9859
9860 if (previous_is_cased)
9861 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9862 else
9863 n_res = _PyUnicode_ToTitleFull(c, mapped);
9864
9865 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009866 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009867 res[k++] = mapped[j];
9868 }
9869
9870 previous_is_cased = _PyUnicode_IsCased(c);
9871 }
9872 return k;
9873}
9874
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009875static PyObject *
9876case_operation(PyObject *self,
9877 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9878{
9879 PyObject *res = NULL;
9880 Py_ssize_t length, newlength = 0;
9881 int kind, outkind;
9882 void *data, *outdata;
9883 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9884
Benjamin Petersoneea48462012-01-16 14:28:50 -05009885 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009886
9887 kind = PyUnicode_KIND(self);
9888 data = PyUnicode_DATA(self);
9889 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009890 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009891 PyErr_SetString(PyExc_OverflowError, "string is too long");
9892 return NULL;
9893 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009894 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009895 if (tmp == NULL)
9896 return PyErr_NoMemory();
9897 newlength = perform(kind, data, length, tmp, &maxchar);
9898 res = PyUnicode_New(newlength, maxchar);
9899 if (res == NULL)
9900 goto leave;
9901 tmpend = tmp + newlength;
9902 outdata = PyUnicode_DATA(res);
9903 outkind = PyUnicode_KIND(res);
9904 switch (outkind) {
9905 case PyUnicode_1BYTE_KIND:
9906 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9907 break;
9908 case PyUnicode_2BYTE_KIND:
9909 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9910 break;
9911 case PyUnicode_4BYTE_KIND:
9912 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9913 break;
9914 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009915 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009916 }
9917 leave:
9918 PyMem_FREE(tmp);
9919 return res;
9920}
9921
Tim Peters8ce9f162004-08-27 01:49:32 +00009922PyObject *
9923PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009925 PyObject *res;
9926 PyObject *fseq;
9927 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009928 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009930 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009931 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009932 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009933 }
9934
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009935 /* NOTE: the following code can't call back into Python code,
9936 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009937 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009938
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009939 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009940 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009941 res = _PyUnicode_JoinArray(separator, items, seqlen);
9942 Py_DECREF(fseq);
9943 return res;
9944}
9945
9946PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009947_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009948{
9949 PyObject *res = NULL; /* the result */
9950 PyObject *sep = NULL;
9951 Py_ssize_t seplen;
9952 PyObject *item;
9953 Py_ssize_t sz, i, res_offset;
9954 Py_UCS4 maxchar;
9955 Py_UCS4 item_maxchar;
9956 int use_memcpy;
9957 unsigned char *res_data = NULL, *sep_data = NULL;
9958 PyObject *last_obj;
9959 unsigned int kind = 0;
9960
Tim Peters05eba1f2004-08-27 21:32:02 +00009961 /* If empty sequence, return u"". */
9962 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009963 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009964 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009965
Tim Peters05eba1f2004-08-27 21:32:02 +00009966 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009967 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009968 if (seqlen == 1) {
9969 if (PyUnicode_CheckExact(items[0])) {
9970 res = items[0];
9971 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009972 return res;
9973 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009974 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009975 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009976 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009977 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009978 /* Set up sep and seplen */
9979 if (separator == NULL) {
9980 /* fall back to a blank space separator */
9981 sep = PyUnicode_FromOrdinal(' ');
9982 if (!sep)
9983 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009984 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009985 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009986 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009987 else {
9988 if (!PyUnicode_Check(separator)) {
9989 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009990 "separator: expected str instance,"
9991 " %.80s found",
9992 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009993 goto onError;
9994 }
9995 if (PyUnicode_READY(separator))
9996 goto onError;
9997 sep = separator;
9998 seplen = PyUnicode_GET_LENGTH(separator);
9999 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10000 /* inc refcount to keep this code path symmetric with the
10001 above case of a blank separator */
10002 Py_INCREF(sep);
10003 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010004 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010005 }
10006
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010007 /* There are at least two things to join, or else we have a subclass
10008 * of str in the sequence.
10009 * Do a pre-pass to figure out the total amount of space we'll
10010 * need (sz), and see whether all argument are strings.
10011 */
10012 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010013#ifdef Py_DEBUG
10014 use_memcpy = 0;
10015#else
10016 use_memcpy = 1;
10017#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010018 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010019 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010020 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010021 if (!PyUnicode_Check(item)) {
10022 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010023 "sequence item %zd: expected str instance,"
10024 " %.80s found",
10025 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010026 goto onError;
10027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 if (PyUnicode_READY(item) == -1)
10029 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010030 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010032 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010033 if (i != 0) {
10034 add_sz += seplen;
10035 }
10036 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010037 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010038 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010039 goto onError;
10040 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010041 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010042 if (use_memcpy && last_obj != NULL) {
10043 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10044 use_memcpy = 0;
10045 }
10046 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010047 }
Tim Petersced69f82003-09-16 20:30:58 +000010048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010050 if (res == NULL)
10051 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010052
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010053 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010054#ifdef Py_DEBUG
10055 use_memcpy = 0;
10056#else
10057 if (use_memcpy) {
10058 res_data = PyUnicode_1BYTE_DATA(res);
10059 kind = PyUnicode_KIND(res);
10060 if (seplen != 0)
10061 sep_data = PyUnicode_1BYTE_DATA(sep);
10062 }
10063#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010064 if (use_memcpy) {
10065 for (i = 0; i < seqlen; ++i) {
10066 Py_ssize_t itemlen;
10067 item = items[i];
10068
10069 /* Copy item, and maybe the separator. */
10070 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010071 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010072 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010073 kind * seplen);
10074 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010075 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010076
10077 itemlen = PyUnicode_GET_LENGTH(item);
10078 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010079 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010080 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010081 kind * itemlen);
10082 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010083 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010084 }
10085 assert(res_data == PyUnicode_1BYTE_DATA(res)
10086 + kind * PyUnicode_GET_LENGTH(res));
10087 }
10088 else {
10089 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10090 Py_ssize_t itemlen;
10091 item = items[i];
10092
10093 /* Copy item, and maybe the separator. */
10094 if (i && seplen != 0) {
10095 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10096 res_offset += seplen;
10097 }
10098
10099 itemlen = PyUnicode_GET_LENGTH(item);
10100 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010101 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010102 res_offset += itemlen;
10103 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010104 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010105 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010106 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010109 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111
Benjamin Peterson29060642009-01-31 22:14:21 +000010112 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010114 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115 return NULL;
10116}
10117
Victor Stinnerd3f08822012-05-29 12:57:52 +020010118void
10119_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10120 Py_UCS4 fill_char)
10121{
10122 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010123 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010124 assert(PyUnicode_IS_READY(unicode));
10125 assert(unicode_modifiable(unicode));
10126 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10127 assert(start >= 0);
10128 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010129 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010130}
10131
Victor Stinner3fe55312012-01-04 00:33:50 +010010132Py_ssize_t
10133PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10134 Py_UCS4 fill_char)
10135{
10136 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010137
10138 if (!PyUnicode_Check(unicode)) {
10139 PyErr_BadInternalCall();
10140 return -1;
10141 }
10142 if (PyUnicode_READY(unicode) == -1)
10143 return -1;
10144 if (unicode_check_modifiable(unicode))
10145 return -1;
10146
Victor Stinnerd3f08822012-05-29 12:57:52 +020010147 if (start < 0) {
10148 PyErr_SetString(PyExc_IndexError, "string index out of range");
10149 return -1;
10150 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010151 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10152 PyErr_SetString(PyExc_ValueError,
10153 "fill character is bigger than "
10154 "the string maximum character");
10155 return -1;
10156 }
10157
10158 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10159 length = Py_MIN(maxlen, length);
10160 if (length <= 0)
10161 return 0;
10162
Victor Stinnerd3f08822012-05-29 12:57:52 +020010163 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010164 return length;
10165}
10166
Victor Stinner9310abb2011-10-05 00:59:23 +020010167static PyObject *
10168pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010169 Py_ssize_t left,
10170 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 PyObject *u;
10174 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010175 int kind;
10176 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
10178 if (left < 0)
10179 left = 0;
10180 if (right < 0)
10181 right = 0;
10182
Victor Stinnerc4b49542011-12-11 22:44:26 +010010183 if (left == 0 && right == 0)
10184 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10187 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010188 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10189 return NULL;
10190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010192 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010194 if (!u)
10195 return NULL;
10196
10197 kind = PyUnicode_KIND(u);
10198 data = PyUnicode_DATA(u);
10199 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010200 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010201 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010202 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010203 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010204 assert(_PyUnicode_CheckConsistency(u, 1));
10205 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206}
10207
Alexander Belopolsky40018472011-02-26 01:02:56 +000010208PyObject *
10209PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010213 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010215
Benjamin Petersonead6b532011-12-20 17:23:42 -060010216 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010218 if (PyUnicode_IS_ASCII(string))
10219 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010220 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010221 PyUnicode_GET_LENGTH(string), keepends);
10222 else
10223 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010224 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010225 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 break;
10227 case PyUnicode_2BYTE_KIND:
10228 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010229 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 PyUnicode_GET_LENGTH(string), keepends);
10231 break;
10232 case PyUnicode_4BYTE_KIND:
10233 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010234 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 PyUnicode_GET_LENGTH(string), keepends);
10236 break;
10237 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010238 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241}
10242
Alexander Belopolsky40018472011-02-26 01:02:56 +000010243static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010244split(PyObject *self,
10245 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010246 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010248 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 void *buf1, *buf2;
10250 Py_ssize_t len1, len2;
10251 PyObject* out;
10252
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010254 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 if (PyUnicode_READY(self) == -1)
10257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010260 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010262 if (PyUnicode_IS_ASCII(self))
10263 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010264 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010265 PyUnicode_GET_LENGTH(self), maxcount
10266 );
10267 else
10268 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010269 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010270 PyUnicode_GET_LENGTH(self), maxcount
10271 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 case PyUnicode_2BYTE_KIND:
10273 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010274 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 PyUnicode_GET_LENGTH(self), maxcount
10276 );
10277 case PyUnicode_4BYTE_KIND:
10278 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010279 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 PyUnicode_GET_LENGTH(self), maxcount
10281 );
10282 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010283 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 }
10285
10286 if (PyUnicode_READY(substring) == -1)
10287 return NULL;
10288
10289 kind1 = PyUnicode_KIND(self);
10290 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 len1 = PyUnicode_GET_LENGTH(self);
10292 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010293 if (kind1 < kind2 || len1 < len2) {
10294 out = PyList_New(1);
10295 if (out == NULL)
10296 return NULL;
10297 Py_INCREF(self);
10298 PyList_SET_ITEM(out, 0, self);
10299 return out;
10300 }
10301 buf1 = PyUnicode_DATA(self);
10302 buf2 = PyUnicode_DATA(substring);
10303 if (kind2 != kind1) {
10304 buf2 = _PyUnicode_AsKind(substring, kind1);
10305 if (!buf2)
10306 return NULL;
10307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010309 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010311 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10312 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010313 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010314 else
10315 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010316 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 break;
10318 case PyUnicode_2BYTE_KIND:
10319 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010320 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 break;
10322 case PyUnicode_4BYTE_KIND:
10323 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010324 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 break;
10326 default:
10327 out = NULL;
10328 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010329 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 PyMem_Free(buf2);
10331 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332}
10333
Alexander Belopolsky40018472011-02-26 01:02:56 +000010334static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010335rsplit(PyObject *self,
10336 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010337 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010338{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010339 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 void *buf1, *buf2;
10341 Py_ssize_t len1, len2;
10342 PyObject* out;
10343
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010344 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010345 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 if (PyUnicode_READY(self) == -1)
10348 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010351 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010353 if (PyUnicode_IS_ASCII(self))
10354 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010355 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010356 PyUnicode_GET_LENGTH(self), maxcount
10357 );
10358 else
10359 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010360 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010361 PyUnicode_GET_LENGTH(self), maxcount
10362 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 case PyUnicode_2BYTE_KIND:
10364 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010365 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 PyUnicode_GET_LENGTH(self), maxcount
10367 );
10368 case PyUnicode_4BYTE_KIND:
10369 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010370 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 PyUnicode_GET_LENGTH(self), maxcount
10372 );
10373 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010374 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 }
10376
10377 if (PyUnicode_READY(substring) == -1)
10378 return NULL;
10379
10380 kind1 = PyUnicode_KIND(self);
10381 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 len1 = PyUnicode_GET_LENGTH(self);
10383 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010384 if (kind1 < kind2 || len1 < len2) {
10385 out = PyList_New(1);
10386 if (out == NULL)
10387 return NULL;
10388 Py_INCREF(self);
10389 PyList_SET_ITEM(out, 0, self);
10390 return out;
10391 }
10392 buf1 = PyUnicode_DATA(self);
10393 buf2 = PyUnicode_DATA(substring);
10394 if (kind2 != kind1) {
10395 buf2 = _PyUnicode_AsKind(substring, kind1);
10396 if (!buf2)
10397 return NULL;
10398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010400 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010402 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10403 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010404 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010405 else
10406 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010407 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 break;
10409 case PyUnicode_2BYTE_KIND:
10410 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010411 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 break;
10413 case PyUnicode_4BYTE_KIND:
10414 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010415 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 break;
10417 default:
10418 out = NULL;
10419 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010420 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 PyMem_Free(buf2);
10422 return out;
10423}
10424
10425static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010426anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10427 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010429 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010431 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10432 return asciilib_find(buf1, len1, buf2, len2, offset);
10433 else
10434 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 case PyUnicode_2BYTE_KIND:
10436 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10437 case PyUnicode_4BYTE_KIND:
10438 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10439 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010440 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441}
10442
10443static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010444anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10445 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010447 switch (kind) {
10448 case PyUnicode_1BYTE_KIND:
10449 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10450 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10451 else
10452 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10453 case PyUnicode_2BYTE_KIND:
10454 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10455 case PyUnicode_4BYTE_KIND:
10456 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10457 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010458 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010459}
10460
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010461static void
10462replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10463 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10464{
10465 int kind = PyUnicode_KIND(u);
10466 void *data = PyUnicode_DATA(u);
10467 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10468 if (kind == PyUnicode_1BYTE_KIND) {
10469 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10470 (Py_UCS1 *)data + len,
10471 u1, u2, maxcount);
10472 }
10473 else if (kind == PyUnicode_2BYTE_KIND) {
10474 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10475 (Py_UCS2 *)data + len,
10476 u1, u2, maxcount);
10477 }
10478 else {
10479 assert(kind == PyUnicode_4BYTE_KIND);
10480 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10481 (Py_UCS4 *)data + len,
10482 u1, u2, maxcount);
10483 }
10484}
10485
/* Build the result of replacing occurrences of str1 inside self by str2.

   self      -- string to search
   str1      -- substring to look for
   str2      -- replacement text
   maxcount  -- upper bound on the number of replacements; a negative value
                is turned into PY_SSIZE_T_MAX

   Returns unicode_result_unchanged(self) when there is nothing to replace,
   a newly built string otherwise, and NULL on failure.  OverflowError is
   set explicitly when the result length would not fit.

   Three strategies are used:
     - len1 == len2 == 1: copy self and patch single characters in place;
     - len1 == len2 > 1:  copy self and overwrite each match in place;
     - len1 != len2:      count the matches first, allocate the exact size,
                          then assemble the result piece by piece (an empty
                          str1 interleaves str2 between characters).

   srelease/release1/release2 record whether sbuf/buf1/buf2 currently point
   to temporary widened copies that this function must free.

   NOTE(review): kind, data and length of all three arguments are read
   before any PyUnicode_READY() call here -- presumably callers pass
   readied strings; confirm against the call sites. */
Alexander Belopolsky40018472011-02-26 01:02:56 +000010486static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487replace(PyObject *self, PyObject *str1,
10488 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 PyObject *u;
10491 char *sbuf = PyUnicode_DATA(self);
10492 char *buf1 = PyUnicode_DATA(str1);
10493 char *buf2 = PyUnicode_DATA(str2);
10494 int srelease = 0, release1 = 0, release2 = 0;
10495 int skind = PyUnicode_KIND(self);
10496 int kind1 = PyUnicode_KIND(str1);
10497 int kind2 = PyUnicode_KIND(str2);
10498 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10499 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10500 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010501 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010502 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503
/* The empty-self shortcut below is only taken for a non-negative maxcount;
   with a negative maxcount an empty self falls through to the general
   code. */
10504 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010505 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010507 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508
/* Pattern and replacement are the very same object: the result would be
   identical to self. */
Victor Stinner59de0ee2011-10-07 10:01:28 +020010509 if (str1 == str2)
10510 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511
Victor Stinner49a0a212011-10-12 23:46:10 +020010512 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010513 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10514 if (maxchar < maxchar_str1)
10515 /* substring too wide to be present */
10516 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010517 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10518 /* Replacing str1 with str2 may cause a maxchar reduction in the
10519 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010520 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
/* From here on maxchar is the maxchar used to allocate the result. */
Benjamin Peterson7e303732013-06-10 09:19:46 -070010521 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010524 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010526 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010528 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010529 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010530 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010531
Victor Stinner69ed0f42013-04-09 21:48:24 +020010532 u1 = PyUnicode_READ(kind1, buf1, 0);
/* Locate the first occurrence so that a string without any match can be
   returned unchanged without allocating. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010533 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010534 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010535 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010536 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010538 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010540
/* Copy self into the new object, then patch characters from pos onwards. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010541 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10542 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010543 }
10544 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 int rkind = skind;
10546 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010547 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 if (kind1 < rkind) {
10550 /* widen substring */
10551 buf1 = _PyUnicode_AsKind(str1, rkind);
10552 if (!buf1) goto error;
10553 release1 = 1;
10554 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010555 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010556 if (i < 0)
10557 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 if (rkind > kind2) {
10559 /* widen replacement */
10560 buf2 = _PyUnicode_AsKind(str2, rkind);
10561 if (!buf2) goto error;
10562 release2 = 1;
10563 }
10564 else if (rkind < kind2) {
10565 /* widen self and buf1 */
10566 rkind = kind2;
/* Drop the copy of str1 widened for the first search; release1 is cleared
   before the next allocation so the error path cannot free it twice. */
10567 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010568 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 sbuf = _PyUnicode_AsKind(self, rkind);
10570 if (!sbuf) goto error;
10571 srelease = 1;
10572 buf1 = _PyUnicode_AsKind(str1, rkind);
10573 if (!buf1) goto error;
10574 release1 = 1;
10575 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010576 u = PyUnicode_New(slen, maxchar);
10577 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010579 assert(PyUnicode_KIND(u) == rkind);
10580 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010581
/* rkind is used as the byte width of one character in all the pointer
   arithmetic below. */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010582 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010583 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010584 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010586 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010588
/* The first match was handled above, hence the pre-decrement.  The search
   runs on the tail starting at i with i passed as offset; the result is
   then used as an index into the whole string. */
10589 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010590 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010591 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010592 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010593 if (i == -1)
10594 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010595 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010597 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010601 }
10602 else {
/* Different lengths: n = number of replacements, i = read position in
   sbuf, j = position of the current match, ires = write position in res. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010604 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 int rkind = skind;
10606 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010609 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 buf1 = _PyUnicode_AsKind(str1, rkind);
10611 if (!buf1) goto error;
10612 release1 = 1;
10613 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010614 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010615 if (n == 0)
10616 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010618 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 buf2 = _PyUnicode_AsKind(str2, rkind);
10620 if (!buf2) goto error;
10621 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010624 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 rkind = kind2;
10626 sbuf = _PyUnicode_AsKind(self, rkind);
10627 if (!sbuf) goto error;
10628 srelease = 1;
10629 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010630 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 buf1 = _PyUnicode_AsKind(str1, rkind);
10632 if (!buf1) goto error;
10633 release1 = 1;
10634 }
10635 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10636 PyUnicode_GET_LENGTH(str1))); */
/* Overflow guard for the new_size computation; n is non-zero here because
   n == 0 already jumped to "nothing", so the division is safe. */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010637 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 PyErr_SetString(PyExc_OverflowError,
10639 "replace string is too long");
10640 goto error;
10641 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010642 new_size = slen + n * (len2 - len1);
/* Everything was replaced by nothing: hand out the shared empty string. */
Victor Stinner49a0a212011-10-12 23:46:10 +020010643 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010644 _Py_INCREF_UNICODE_EMPTY();
10645 if (!unicode_empty)
10646 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010647 u = unicode_empty;
10648 goto done;
10649 }
/* Second guard: the byte size rkind * new_size must fit as well. */
Xiang Zhangb0541f42017-01-10 10:52:00 +080010650 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 PyErr_SetString(PyExc_OverflowError,
10652 "replace string is too long");
10653 goto error;
10654 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010655 u = PyUnicode_New(new_size, maxchar);
10656 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010658 assert(PyUnicode_KIND(u) == rkind);
10659 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 ires = i = 0;
10661 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010662 while (n-- > 0) {
10663 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010664 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010665 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010666 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010667 if (j == -1)
10668 break;
10669 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010671 memcpy(res + rkind * ires,
10672 sbuf + rkind * i,
10673 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010675 }
10676 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010678 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010680 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010686 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010687 memcpy(res + rkind * ires,
10688 sbuf + rkind * i,
10689 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010690 }
10691 else {
/* Empty pattern: emit str2, then one character of self, n times over,
   stopping right after the n-th copy of str2. */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010692 /* interleave */
10693 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010694 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010696 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010698 if (--n <= 0)
10699 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010700 memcpy(res + rkind * ires,
10701 sbuf + rkind * i,
10702 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 ires++;
10704 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010705 }
/* Copy whatever part of self has not been consumed yet. */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010706 memcpy(res + rkind * ires,
10707 sbuf + rkind * i,
10708 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010709 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010710 }
10711
/* The result was allocated for the larger maxchar; give
   unicode_adjust_maxchar() a chance to act on it when the replaced text
   carried the widest characters. */
10712 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010713 unicode_adjust_maxchar(&u);
10714 if (u == NULL)
10715 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010717
/* Success: free the temporary widened buffers and return the new string. */
10718 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 if (srelease)
10720 PyMem_FREE(sbuf);
10721 if (release1)
10722 PyMem_FREE(buf1);
10723 if (release2)
10724 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010725 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010727
Benjamin Peterson29060642009-01-31 22:14:21 +000010728 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010729 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 if (srelease)
10731 PyMem_FREE(sbuf);
10732 if (release1)
10733 PyMem_FREE(buf1);
10734 if (release2)
10735 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010736 return unicode_result_unchanged(self);
10737
/* Failure: free only buffers that are both owned and non-NULL. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 error:
10739 if (srelease && sbuf)
10740 PyMem_FREE(sbuf);
10741 if (release1 && buf1)
10742 PyMem_FREE(buf1);
10743 if (release2 && buf2)
10744 PyMem_FREE(buf2);
10745 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746}
10747
10748/* --- Unicode Object Methods --------------------------------------------- */
10749
INADA Naoki3ae20562017-01-16 20:41:20 +090010750/*[clinic input]
10751str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752
INADA Naoki3ae20562017-01-16 20:41:20 +090010753Return a version of the string where each word is titlecased.
10754
10755More specifically, words start with uppercased characters and all remaining
10756cased characters have lower case.
10757[clinic start generated code]*/
10758
10759static PyObject *
10760unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010761/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010763 if (PyUnicode_READY(self) == -1)
10764 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010765 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766}
10767
INADA Naoki3ae20562017-01-16 20:41:20 +090010768/*[clinic input]
10769str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770
INADA Naoki3ae20562017-01-16 20:41:20 +090010771Return a capitalized version of the string.
10772
10773More specifically, make the first character have upper case and the rest lower
10774case.
10775[clinic start generated code]*/
10776
10777static PyObject *
10778unicode_capitalize_impl(PyObject *self)
10779/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010781 if (PyUnicode_READY(self) == -1)
10782 return NULL;
10783 if (PyUnicode_GET_LENGTH(self) == 0)
10784 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010785 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786}
10787
INADA Naoki3ae20562017-01-16 20:41:20 +090010788/*[clinic input]
10789str.casefold as unicode_casefold
10790
10791Return a version of the string suitable for caseless comparisons.
10792[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010793
10794static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010795unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010796/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010797{
10798 if (PyUnicode_READY(self) == -1)
10799 return NULL;
10800 if (PyUnicode_IS_ASCII(self))
10801 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010802 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010803}
10804
10805
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010806/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010807
10808static int
10809convert_uc(PyObject *obj, void *addr)
10810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010812
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010813 if (!PyUnicode_Check(obj)) {
10814 PyErr_Format(PyExc_TypeError,
10815 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010816 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010817 return 0;
10818 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010819 if (PyUnicode_READY(obj) < 0)
10820 return 0;
10821 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010822 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010823 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010824 return 0;
10825 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010826 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010827 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010828}
10829
INADA Naoki3ae20562017-01-16 20:41:20 +090010830/*[clinic input]
10831str.center as unicode_center
10832
10833 width: Py_ssize_t
10834 fillchar: Py_UCS4 = ' '
10835 /
10836
10837Return a centered string of length width.
10838
10839Padding is done using the specified fill character (default is a space).
10840[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
10842static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010843unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10844/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010846 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847
Benjamin Petersonbac79492012-01-14 13:34:47 -050010848 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849 return NULL;
10850
Victor Stinnerc4b49542011-12-11 22:44:26 +010010851 if (PyUnicode_GET_LENGTH(self) >= width)
10852 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853
Victor Stinnerc4b49542011-12-11 22:44:26 +010010854 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855 left = marg / 2 + (marg & width & 1);
10856
Victor Stinner9310abb2011-10-05 00:59:23 +020010857 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858}
10859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860/* This function assumes that str1 and str2 are readied by the caller. */
10861
Marc-André Lemburge5034372000-08-08 08:04:29 +000010862static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010863unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010864{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010865#define COMPARE(TYPE1, TYPE2) \
10866 do { \
10867 TYPE1* p1 = (TYPE1 *)data1; \
10868 TYPE2* p2 = (TYPE2 *)data2; \
10869 TYPE1* end = p1 + len; \
10870 Py_UCS4 c1, c2; \
10871 for (; p1 != end; p1++, p2++) { \
10872 c1 = *p1; \
10873 c2 = *p2; \
10874 if (c1 != c2) \
10875 return (c1 < c2) ? -1 : 1; \
10876 } \
10877 } \
10878 while (0)
10879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 int kind1, kind2;
10881 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010882 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010884 kind1 = PyUnicode_KIND(str1);
10885 kind2 = PyUnicode_KIND(str2);
10886 data1 = PyUnicode_DATA(str1);
10887 data2 = PyUnicode_DATA(str2);
10888 len1 = PyUnicode_GET_LENGTH(str1);
10889 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010890 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010891
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010892 switch(kind1) {
10893 case PyUnicode_1BYTE_KIND:
10894 {
10895 switch(kind2) {
10896 case PyUnicode_1BYTE_KIND:
10897 {
10898 int cmp = memcmp(data1, data2, len);
10899 /* normalize result of memcmp() into the range [-1; 1] */
10900 if (cmp < 0)
10901 return -1;
10902 if (cmp > 0)
10903 return 1;
10904 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010905 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010906 case PyUnicode_2BYTE_KIND:
10907 COMPARE(Py_UCS1, Py_UCS2);
10908 break;
10909 case PyUnicode_4BYTE_KIND:
10910 COMPARE(Py_UCS1, Py_UCS4);
10911 break;
10912 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010913 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010914 }
10915 break;
10916 }
10917 case PyUnicode_2BYTE_KIND:
10918 {
10919 switch(kind2) {
10920 case PyUnicode_1BYTE_KIND:
10921 COMPARE(Py_UCS2, Py_UCS1);
10922 break;
10923 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010924 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010925 COMPARE(Py_UCS2, Py_UCS2);
10926 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010927 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010928 case PyUnicode_4BYTE_KIND:
10929 COMPARE(Py_UCS2, Py_UCS4);
10930 break;
10931 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010932 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010933 }
10934 break;
10935 }
10936 case PyUnicode_4BYTE_KIND:
10937 {
10938 switch(kind2) {
10939 case PyUnicode_1BYTE_KIND:
10940 COMPARE(Py_UCS4, Py_UCS1);
10941 break;
10942 case PyUnicode_2BYTE_KIND:
10943 COMPARE(Py_UCS4, Py_UCS2);
10944 break;
10945 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010946 {
10947#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10948 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10949 /* normalize result of wmemcmp() into the range [-1; 1] */
10950 if (cmp < 0)
10951 return -1;
10952 if (cmp > 0)
10953 return 1;
10954#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010955 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010956#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010957 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010958 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010959 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010960 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010961 }
10962 break;
10963 }
10964 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010965 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010966 }
10967
Victor Stinner770e19e2012-10-04 22:59:45 +020010968 if (len1 == len2)
10969 return 0;
10970 if (len1 < len2)
10971 return -1;
10972 else
10973 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010974
10975#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010976}
10977
Benjamin Peterson621b4302016-09-09 13:54:34 -070010978static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010979unicode_compare_eq(PyObject *str1, PyObject *str2)
10980{
10981 int kind;
10982 void *data1, *data2;
10983 Py_ssize_t len;
10984 int cmp;
10985
Victor Stinnere5567ad2012-10-23 02:48:49 +020010986 len = PyUnicode_GET_LENGTH(str1);
10987 if (PyUnicode_GET_LENGTH(str2) != len)
10988 return 0;
10989 kind = PyUnicode_KIND(str1);
10990 if (PyUnicode_KIND(str2) != kind)
10991 return 0;
10992 data1 = PyUnicode_DATA(str1);
10993 data2 = PyUnicode_DATA(str2);
10994
10995 cmp = memcmp(data1, data2, len * kind);
10996 return (cmp == 0);
10997}
10998
10999
Alexander Belopolsky40018472011-02-26 01:02:56 +000011000int
11001PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11004 if (PyUnicode_READY(left) == -1 ||
11005 PyUnicode_READY(right) == -1)
11006 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011007
11008 /* a string is equal to itself */
11009 if (left == right)
11010 return 0;
11011
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011012 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011014 PyErr_Format(PyExc_TypeError,
11015 "Can't compare %.100s and %.100s",
11016 left->ob_type->tp_name,
11017 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018 return -1;
11019}
11020
Martin v. Löwis5b222132007-06-10 09:51:05 +000011021int
11022PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 Py_ssize_t i;
11025 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011027 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028
Victor Stinner910337b2011-10-03 03:20:16 +020011029 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011030 if (!PyUnicode_IS_READY(uni)) {
11031 const wchar_t *ws = _PyUnicode_WSTR(uni);
11032 /* Compare Unicode string and source character set string */
11033 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11034 if (chr != ustr[i])
11035 return (chr < ustr[i]) ? -1 : 1;
11036 }
11037 /* This check keeps Python strings that end in '\0' from comparing equal
11038 to C strings identical up to that point. */
11039 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11040 return 1; /* uni is longer */
11041 if (ustr[i])
11042 return -1; /* str is longer */
11043 return 0;
11044 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011046 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011047 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011048 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011049 size_t len, len2 = strlen(str);
11050 int cmp;
11051
11052 len = Py_MIN(len1, len2);
11053 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011054 if (cmp != 0) {
11055 if (cmp < 0)
11056 return -1;
11057 else
11058 return 1;
11059 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011060 if (len1 > len2)
11061 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011062 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011063 return -1; /* str is longer */
11064 return 0;
11065 }
11066 else {
11067 void *data = PyUnicode_DATA(uni);
11068 /* Compare Unicode string and source character set string */
11069 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011070 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011071 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11072 /* This check keeps Python strings that end in '\0' from comparing equal
11073 to C strings identical up to that point. */
11074 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11075 return 1; /* uni is longer */
11076 if (str[i])
11077 return -1; /* str is longer */
11078 return 0;
11079 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011080}
11081
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011082static int
11083non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11084{
11085 size_t i, len;
11086 const wchar_t *p;
11087 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11088 if (strlen(str) != len)
11089 return 0;
11090 p = _PyUnicode_WSTR(unicode);
11091 assert(p);
11092 for (i = 0; i < len; i++) {
11093 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011094 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011095 return 0;
11096 }
11097 return 1;
11098}
11099
11100int
11101_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11102{
11103 size_t len;
11104 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011105 assert(str);
11106#ifndef NDEBUG
11107 for (const char *p = str; *p; p++) {
11108 assert((unsigned char)*p < 128);
11109 }
11110#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011111 if (PyUnicode_READY(unicode) == -1) {
11112 /* Memory error or bad data */
11113 PyErr_Clear();
11114 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11115 }
11116 if (!PyUnicode_IS_ASCII(unicode))
11117 return 0;
11118 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11119 return strlen(str) == len &&
11120 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11121}
11122
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011123int
11124_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11125{
11126 PyObject *right_uni;
11127 Py_hash_t hash;
11128
11129 assert(_PyUnicode_CHECK(left));
11130 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011131#ifndef NDEBUG
11132 for (const char *p = right->string; *p; p++) {
11133 assert((unsigned char)*p < 128);
11134 }
11135#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011136
11137 if (PyUnicode_READY(left) == -1) {
11138 /* memory error or bad data */
11139 PyErr_Clear();
11140 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11141 }
11142
11143 if (!PyUnicode_IS_ASCII(left))
11144 return 0;
11145
11146 right_uni = _PyUnicode_FromId(right); /* borrowed */
11147 if (right_uni == NULL) {
11148 /* memory error or bad data */
11149 PyErr_Clear();
11150 return _PyUnicode_EqualToASCIIString(left, right->string);
11151 }
11152
11153 if (left == right_uni)
11154 return 1;
11155
11156 if (PyUnicode_CHECK_INTERNED(left))
11157 return 0;
11158
INADA Naoki7cc95f52018-01-28 02:07:09 +090011159 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011160 hash = _PyUnicode_HASH(left);
11161 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11162 return 0;
11163
11164 return unicode_compare_eq(left, right_uni);
11165}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011166
Alexander Belopolsky40018472011-02-26 01:02:56 +000011167PyObject *
11168PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011169{
11170 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011171
Victor Stinnere5567ad2012-10-23 02:48:49 +020011172 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11173 Py_RETURN_NOTIMPLEMENTED;
11174
11175 if (PyUnicode_READY(left) == -1 ||
11176 PyUnicode_READY(right) == -1)
11177 return NULL;
11178
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011179 if (left == right) {
11180 switch (op) {
11181 case Py_EQ:
11182 case Py_LE:
11183 case Py_GE:
11184 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011185 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011186 case Py_NE:
11187 case Py_LT:
11188 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011189 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011190 default:
11191 PyErr_BadArgument();
11192 return NULL;
11193 }
11194 }
11195 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011196 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011197 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011198 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011199 }
11200 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011201 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011202 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011203 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011204}
11205
Alexander Belopolsky40018472011-02-26 01:02:56 +000011206int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011207_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11208{
11209 return unicode_eq(aa, bb);
11210}
11211
11212int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011213PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011214{
Victor Stinner77282cb2013-04-14 19:22:47 +020011215 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 void *buf1, *buf2;
11217 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011218 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011219
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011220 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011221 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011222 "'in <string>' requires string as left operand, not %.100s",
11223 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011224 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011225 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011226 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011227 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011228 if (ensure_unicode(str) < 0)
11229 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011232 kind2 = PyUnicode_KIND(substr);
11233 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011234 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011235 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 len2 = PyUnicode_GET_LENGTH(substr);
11237 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011238 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011239 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011240 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011241 if (len2 == 1) {
11242 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11243 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011244 return result;
11245 }
11246 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011247 buf2 = _PyUnicode_AsKind(substr, kind1);
11248 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011249 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011251
Victor Stinner77282cb2013-04-14 19:22:47 +020011252 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 case PyUnicode_1BYTE_KIND:
11254 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11255 break;
11256 case PyUnicode_2BYTE_KIND:
11257 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11258 break;
11259 case PyUnicode_4BYTE_KIND:
11260 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11261 break;
11262 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011263 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011264 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011265
Victor Stinner77282cb2013-04-14 19:22:47 +020011266 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 PyMem_Free(buf2);
11268
Guido van Rossum403d68b2000-03-13 15:55:09 +000011269 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011270}
11271
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272/* Concat to string or Unicode object giving a new Unicode object. */
11273
Alexander Belopolsky40018472011-02-26 01:02:56 +000011274PyObject *
11275PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011277 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011278 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011279 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011281 if (ensure_unicode(left) < 0)
11282 return NULL;
11283
11284 if (!PyUnicode_Check(right)) {
11285 PyErr_Format(PyExc_TypeError,
11286 "can only concatenate str (not \"%.200s\") to str",
11287 right->ob_type->tp_name);
11288 return NULL;
11289 }
11290 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011291 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292
11293 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011294 if (left == unicode_empty)
11295 return PyUnicode_FromObject(right);
11296 if (right == unicode_empty)
11297 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011299 left_len = PyUnicode_GET_LENGTH(left);
11300 right_len = PyUnicode_GET_LENGTH(right);
11301 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011302 PyErr_SetString(PyExc_OverflowError,
11303 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011304 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011305 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011306 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011307
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011308 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11309 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011310 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011313 result = PyUnicode_New(new_len, maxchar);
11314 if (result == NULL)
11315 return NULL;
11316 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11317 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11318 assert(_PyUnicode_CheckConsistency(result, 1));
11319 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320}
11321
Walter Dörwald1ab83302007-05-18 17:15:44 +000011322void
Victor Stinner23e56682011-10-03 03:54:37 +020011323PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011324{
Victor Stinner23e56682011-10-03 03:54:37 +020011325 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011326 Py_UCS4 maxchar, maxchar2;
11327 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011328
11329 if (p_left == NULL) {
11330 if (!PyErr_Occurred())
11331 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011332 return;
11333 }
Victor Stinner23e56682011-10-03 03:54:37 +020011334 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011335 if (right == NULL || left == NULL
11336 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011337 if (!PyErr_Occurred())
11338 PyErr_BadInternalCall();
11339 goto error;
11340 }
11341
Benjamin Petersonbac79492012-01-14 13:34:47 -050011342 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011343 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011344 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011345 goto error;
11346
Victor Stinner488fa492011-12-12 00:01:39 +010011347 /* Shortcuts */
11348 if (left == unicode_empty) {
11349 Py_DECREF(left);
11350 Py_INCREF(right);
11351 *p_left = right;
11352 return;
11353 }
11354 if (right == unicode_empty)
11355 return;
11356
11357 left_len = PyUnicode_GET_LENGTH(left);
11358 right_len = PyUnicode_GET_LENGTH(right);
11359 if (left_len > PY_SSIZE_T_MAX - right_len) {
11360 PyErr_SetString(PyExc_OverflowError,
11361 "strings are too large to concat");
11362 goto error;
11363 }
11364 new_len = left_len + right_len;
11365
11366 if (unicode_modifiable(left)
11367 && PyUnicode_CheckExact(right)
11368 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011369 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11370 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011371 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011372 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011373 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11374 {
11375 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011376 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011377 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011378
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011379 /* copy 'right' into the newly allocated area of 'left' */
11380 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011381 }
Victor Stinner488fa492011-12-12 00:01:39 +010011382 else {
11383 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11384 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011385 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011386
Victor Stinner488fa492011-12-12 00:01:39 +010011387 /* Concat the two Unicode strings */
11388 res = PyUnicode_New(new_len, maxchar);
11389 if (res == NULL)
11390 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011391 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11392 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011393 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011394 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011395 }
11396 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011397 return;
11398
11399error:
Victor Stinner488fa492011-12-12 00:01:39 +010011400 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011401}
11402
11403void
11404PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11405{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011406 PyUnicode_Append(pleft, right);
11407 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011408}
11409
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011410/*
11411Wraps stringlib_parse_args_finds() and additionally ensures that the
11412first argument is a unicode object.
11413*/
11414
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011415static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011416parse_args_finds_unicode(const char * function_name, PyObject *args,
11417 PyObject **substring,
11418 Py_ssize_t *start, Py_ssize_t *end)
11419{
11420 if(stringlib_parse_args_finds(function_name, args, substring,
11421 start, end)) {
11422 if (ensure_unicode(*substring) < 0)
11423 return 0;
11424 return 1;
11425 }
11426 return 0;
11427}
11428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011429PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011430 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011432Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011433string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011434interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435
11436static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011437unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011439 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011440 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011441 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011443 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 void *buf1, *buf2;
11445 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011447 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 kind1 = PyUnicode_KIND(self);
11451 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011452 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011453 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 len1 = PyUnicode_GET_LENGTH(self);
11456 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011458 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011459 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011460
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011461 buf1 = PyUnicode_DATA(self);
11462 buf2 = PyUnicode_DATA(substring);
11463 if (kind2 != kind1) {
11464 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011465 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011466 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011467 }
11468 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 case PyUnicode_1BYTE_KIND:
11470 iresult = ucs1lib_count(
11471 ((Py_UCS1*)buf1) + start, end - start,
11472 buf2, len2, PY_SSIZE_T_MAX
11473 );
11474 break;
11475 case PyUnicode_2BYTE_KIND:
11476 iresult = ucs2lib_count(
11477 ((Py_UCS2*)buf1) + start, end - start,
11478 buf2, len2, PY_SSIZE_T_MAX
11479 );
11480 break;
11481 case PyUnicode_4BYTE_KIND:
11482 iresult = ucs4lib_count(
11483 ((Py_UCS4*)buf1) + start, end - start,
11484 buf2, len2, PY_SSIZE_T_MAX
11485 );
11486 break;
11487 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011488 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 }
11490
11491 result = PyLong_FromSsize_t(iresult);
11492
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011493 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 return result;
11497}
11498
INADA Naoki3ae20562017-01-16 20:41:20 +090011499/*[clinic input]
11500str.encode as unicode_encode
11501
11502 encoding: str(c_default="NULL") = 'utf-8'
11503 The encoding in which to encode the string.
11504 errors: str(c_default="NULL") = 'strict'
11505 The error handling scheme to use for encoding errors.
11506 The default is 'strict' meaning that encoding errors raise a
11507 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11508 'xmlcharrefreplace' as well as any other name registered with
11509 codecs.register_error that can handle UnicodeEncodeErrors.
11510
11511Encode the string using the codec registered for encoding.
11512[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
11514static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011515unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011516/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011518 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011519}
11520
INADA Naoki3ae20562017-01-16 20:41:20 +090011521/*[clinic input]
11522str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523
INADA Naoki3ae20562017-01-16 20:41:20 +090011524 tabsize: int = 8
11525
11526Return a copy where all tab characters are expanded using spaces.
11527
11528If tabsize is not given, a tab size of 8 characters is assumed.
11529[clinic start generated code]*/
11530
11531static PyObject *
11532unicode_expandtabs_impl(PyObject *self, int tabsize)
11533/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011535 Py_ssize_t i, j, line_pos, src_len, incr;
11536 Py_UCS4 ch;
11537 PyObject *u;
11538 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011539 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011540 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541
Antoine Pitrou22425222011-10-04 19:10:51 +020011542 if (PyUnicode_READY(self) == -1)
11543 return NULL;
11544
Thomas Wouters7e474022000-07-16 12:04:32 +000011545 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011546 src_len = PyUnicode_GET_LENGTH(self);
11547 i = j = line_pos = 0;
11548 kind = PyUnicode_KIND(self);
11549 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011550 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011551 for (; i < src_len; i++) {
11552 ch = PyUnicode_READ(kind, src_data, i);
11553 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011554 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011556 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011558 goto overflow;
11559 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011560 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011561 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011564 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011565 goto overflow;
11566 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011568 if (ch == '\n' || ch == '\r')
11569 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011571 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011572 if (!found)
11573 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011574
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011576 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577 if (!u)
11578 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011579 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580
Antoine Pitroue71d5742011-10-04 15:55:09 +020011581 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
Antoine Pitroue71d5742011-10-04 15:55:09 +020011583 for (; i < src_len; i++) {
11584 ch = PyUnicode_READ(kind, src_data, i);
11585 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011587 incr = tabsize - (line_pos % tabsize);
11588 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011589 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011590 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011591 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011592 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011594 line_pos++;
11595 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011596 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011597 if (ch == '\n' || ch == '\r')
11598 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011600 }
11601 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011602 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011603
Antoine Pitroue71d5742011-10-04 15:55:09 +020011604 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011605 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11606 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607}
11608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011609PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011610 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611\n\
11612Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011613such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614arguments start and end are interpreted as in slice notation.\n\
11615\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011616Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617
11618static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011621 /* initialize variables to prevent gcc warning */
11622 PyObject *substring = NULL;
11623 Py_ssize_t start = 0;
11624 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011625 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011627 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011630 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011633 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 if (result == -2)
11636 return NULL;
11637
Christian Heimes217cfd12007-12-02 14:31:20 +000011638 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639}
11640
11641static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011642unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011644 void *data;
11645 enum PyUnicode_Kind kind;
11646 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011647
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011648 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011649 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011651 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011652 if (PyUnicode_READY(self) == -1) {
11653 return NULL;
11654 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011655 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11656 PyErr_SetString(PyExc_IndexError, "string index out of range");
11657 return NULL;
11658 }
11659 kind = PyUnicode_KIND(self);
11660 data = PyUnicode_DATA(self);
11661 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011662 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663}
11664
Guido van Rossumc2504932007-09-18 19:42:40 +000011665/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011666 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011667static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011668unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669{
Guido van Rossumc2504932007-09-18 19:42:40 +000011670 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011671 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011672
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011673#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011674 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011675#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 if (_PyUnicode_HASH(self) != -1)
11677 return _PyUnicode_HASH(self);
11678 if (PyUnicode_READY(self) == -1)
11679 return -1;
11680 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011681 /*
11682 We make the hash of the empty string be 0, rather than using
11683 (prefix ^ suffix), since this slightly obfuscates the hash secret
11684 */
11685 if (len == 0) {
11686 _PyUnicode_HASH(self) = 0;
11687 return 0;
11688 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011689 x = _Py_HashBytes(PyUnicode_DATA(self),
11690 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011692 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693}
11694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011695PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011696 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697\n\
oldkaa0735f2018-02-02 16:52:55 +080011698Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011699such that sub is contained within S[start:end]. Optional\n\
11700arguments start and end are interpreted as in slice notation.\n\
11701\n\
11702Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703
11704static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011707 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011708 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011709 PyObject *substring = NULL;
11710 Py_ssize_t start = 0;
11711 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011713 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011716 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011719 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 if (result == -2)
11722 return NULL;
11723
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724 if (result < 0) {
11725 PyErr_SetString(PyExc_ValueError, "substring not found");
11726 return NULL;
11727 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011728
Christian Heimes217cfd12007-12-02 14:31:20 +000011729 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730}
11731
INADA Naoki3ae20562017-01-16 20:41:20 +090011732/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011733str.isascii as unicode_isascii
11734
11735Return True if all characters in the string are ASCII, False otherwise.
11736
11737ASCII characters have code points in the range U+0000-U+007F.
11738Empty string is ASCII too.
11739[clinic start generated code]*/
11740
11741static PyObject *
11742unicode_isascii_impl(PyObject *self)
11743/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11744{
11745 if (PyUnicode_READY(self) == -1) {
11746 return NULL;
11747 }
11748 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11749}
11750
11751/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011752str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753
INADA Naoki3ae20562017-01-16 20:41:20 +090011754Return True if the string is a lowercase string, False otherwise.
11755
11756A string is lowercase if all cased characters in the string are lowercase and
11757there is at least one cased character in the string.
11758[clinic start generated code]*/
11759
11760static PyObject *
11761unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011762/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 Py_ssize_t i, length;
11765 int kind;
11766 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767 int cased;
11768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 if (PyUnicode_READY(self) == -1)
11770 return NULL;
11771 length = PyUnicode_GET_LENGTH(self);
11772 kind = PyUnicode_KIND(self);
11773 data = PyUnicode_DATA(self);
11774
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 if (length == 1)
11777 return PyBool_FromLong(
11778 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011780 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011782 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011783
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 for (i = 0; i < length; i++) {
11786 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011787
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011789 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011790 else if (!cased && Py_UNICODE_ISLOWER(ch))
11791 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011793 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794}
11795
INADA Naoki3ae20562017-01-16 20:41:20 +090011796/*[clinic input]
11797str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
INADA Naoki3ae20562017-01-16 20:41:20 +090011799Return True if the string is an uppercase string, False otherwise.
11800
11801A string is uppercase if all cased characters in the string are uppercase and
11802there is at least one cased character in the string.
11803[clinic start generated code]*/
11804
11805static PyObject *
11806unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011807/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 Py_ssize_t i, length;
11810 int kind;
11811 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812 int cased;
11813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 if (PyUnicode_READY(self) == -1)
11815 return NULL;
11816 length = PyUnicode_GET_LENGTH(self);
11817 kind = PyUnicode_KIND(self);
11818 data = PyUnicode_DATA(self);
11819
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 if (length == 1)
11822 return PyBool_FromLong(
11823 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011825 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011827 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011828
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 for (i = 0; i < length; i++) {
11831 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011832
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011834 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011835 else if (!cased && Py_UNICODE_ISUPPER(ch))
11836 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011838 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839}
11840
INADA Naoki3ae20562017-01-16 20:41:20 +090011841/*[clinic input]
11842str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843
INADA Naoki3ae20562017-01-16 20:41:20 +090011844Return True if the string is a title-cased string, False otherwise.
11845
11846In a title-cased string, upper- and title-case characters may only
11847follow uncased characters and lowercase characters only cased ones.
11848[clinic start generated code]*/
11849
11850static PyObject *
11851unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011852/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 Py_ssize_t i, length;
11855 int kind;
11856 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857 int cased, previous_is_cased;
11858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (PyUnicode_READY(self) == -1)
11860 return NULL;
11861 length = PyUnicode_GET_LENGTH(self);
11862 kind = PyUnicode_KIND(self);
11863 data = PyUnicode_DATA(self);
11864
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 if (length == 1) {
11867 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11868 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11869 (Py_UNICODE_ISUPPER(ch) != 0));
11870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011872 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011874 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011875
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876 cased = 0;
11877 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 for (i = 0; i < length; i++) {
11879 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011880
Benjamin Peterson29060642009-01-31 22:14:21 +000011881 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11882 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011883 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011884 previous_is_cased = 1;
11885 cased = 1;
11886 }
11887 else if (Py_UNICODE_ISLOWER(ch)) {
11888 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011889 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011890 previous_is_cased = 1;
11891 cased = 1;
11892 }
11893 else
11894 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011896 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897}
11898
INADA Naoki3ae20562017-01-16 20:41:20 +090011899/*[clinic input]
11900str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901
INADA Naoki3ae20562017-01-16 20:41:20 +090011902Return True if the string is a whitespace string, False otherwise.
11903
11904A string is whitespace if all characters in the string are whitespace and there
11905is at least one character in the string.
11906[clinic start generated code]*/
11907
11908static PyObject *
11909unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011910/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 Py_ssize_t i, length;
11913 int kind;
11914 void *data;
11915
11916 if (PyUnicode_READY(self) == -1)
11917 return NULL;
11918 length = PyUnicode_GET_LENGTH(self);
11919 kind = PyUnicode_KIND(self);
11920 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 if (length == 1)
11924 return PyBool_FromLong(
11925 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011927 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011929 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 for (i = 0; i < length; i++) {
11932 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011933 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011934 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011936 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937}
11938
INADA Naoki3ae20562017-01-16 20:41:20 +090011939/*[clinic input]
11940str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011941
INADA Naoki3ae20562017-01-16 20:41:20 +090011942Return True if the string is an alphabetic string, False otherwise.
11943
11944A string is alphabetic if all characters in the string are alphabetic and there
11945is at least one character in the string.
11946[clinic start generated code]*/
11947
11948static PyObject *
11949unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011950/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 Py_ssize_t i, length;
11953 int kind;
11954 void *data;
11955
11956 if (PyUnicode_READY(self) == -1)
11957 return NULL;
11958 length = PyUnicode_GET_LENGTH(self);
11959 kind = PyUnicode_KIND(self);
11960 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011961
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011962 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 if (length == 1)
11964 return PyBool_FromLong(
11965 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011966
11967 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011969 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 for (i = 0; i < length; i++) {
11972 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011973 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011974 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011975 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011976}
11977
INADA Naoki3ae20562017-01-16 20:41:20 +090011978/*[clinic input]
11979str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011980
INADA Naoki3ae20562017-01-16 20:41:20 +090011981Return True if the string is an alpha-numeric string, False otherwise.
11982
11983A string is alpha-numeric if all characters in the string are alpha-numeric and
11984there is at least one character in the string.
11985[clinic start generated code]*/
11986
11987static PyObject *
11988unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011989/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 int kind;
11992 void *data;
11993 Py_ssize_t len, i;
11994
11995 if (PyUnicode_READY(self) == -1)
11996 return NULL;
11997
11998 kind = PyUnicode_KIND(self);
11999 data = PyUnicode_DATA(self);
12000 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012001
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012002 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 if (len == 1) {
12004 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12005 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12006 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012007
12008 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012010 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 for (i = 0; i < len; i++) {
12013 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012014 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012015 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012016 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012017 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012018}
12019
INADA Naoki3ae20562017-01-16 20:41:20 +090012020/*[clinic input]
12021str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022
INADA Naoki3ae20562017-01-16 20:41:20 +090012023Return True if the string is a decimal string, False otherwise.
12024
12025A string is a decimal string if all characters in the string are decimal and
12026there is at least one character in the string.
12027[clinic start generated code]*/
12028
12029static PyObject *
12030unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012031/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 Py_ssize_t i, length;
12034 int kind;
12035 void *data;
12036
12037 if (PyUnicode_READY(self) == -1)
12038 return NULL;
12039 length = PyUnicode_GET_LENGTH(self);
12040 kind = PyUnicode_KIND(self);
12041 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 if (length == 1)
12045 return PyBool_FromLong(
12046 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012048 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012050 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 for (i = 0; i < length; i++) {
12053 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012054 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012056 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057}
12058
INADA Naoki3ae20562017-01-16 20:41:20 +090012059/*[clinic input]
12060str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061
INADA Naoki3ae20562017-01-16 20:41:20 +090012062Return True if the string is a digit string, False otherwise.
12063
12064A string is a digit string if all characters in the string are digits and there
12065is at least one character in the string.
12066[clinic start generated code]*/
12067
12068static PyObject *
12069unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012070/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 Py_ssize_t i, length;
12073 int kind;
12074 void *data;
12075
12076 if (PyUnicode_READY(self) == -1)
12077 return NULL;
12078 length = PyUnicode_GET_LENGTH(self);
12079 kind = PyUnicode_KIND(self);
12080 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 if (length == 1) {
12084 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12085 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012088 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012090 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 for (i = 0; i < length; i++) {
12093 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012094 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012096 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097}
12098
INADA Naoki3ae20562017-01-16 20:41:20 +090012099/*[clinic input]
12100str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101
INADA Naoki3ae20562017-01-16 20:41:20 +090012102Return True if the string is a numeric string, False otherwise.
12103
12104A string is numeric if all characters in the string are numeric and there is at
12105least one character in the string.
12106[clinic start generated code]*/
12107
12108static PyObject *
12109unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012110/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 Py_ssize_t i, length;
12113 int kind;
12114 void *data;
12115
12116 if (PyUnicode_READY(self) == -1)
12117 return NULL;
12118 length = PyUnicode_GET_LENGTH(self);
12119 kind = PyUnicode_KIND(self);
12120 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 if (length == 1)
12124 return PyBool_FromLong(
12125 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012127 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012129 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 for (i = 0; i < length; i++) {
12132 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012133 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012135 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136}
12137
Martin v. Löwis47383402007-08-15 07:32:56 +000012138int
12139PyUnicode_IsIdentifier(PyObject *self)
12140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 int kind;
12142 void *data;
12143 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012144 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 if (PyUnicode_READY(self) == -1) {
12147 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012148 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 }
12150
12151 /* Special case for empty strings */
12152 if (PyUnicode_GET_LENGTH(self) == 0)
12153 return 0;
12154 kind = PyUnicode_KIND(self);
12155 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012156
12157 /* PEP 3131 says that the first character must be in
12158 XID_Start and subsequent characters in XID_Continue,
12159 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012160 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012161 letters, digits, underscore). However, given the current
12162 definition of XID_Start and XID_Continue, it is sufficient
12163 to check just for these, except that _ must be allowed
12164 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012166 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012167 return 0;
12168
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012169 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012171 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012172 return 1;
12173}
12174
INADA Naoki3ae20562017-01-16 20:41:20 +090012175/*[clinic input]
12176str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012177
INADA Naoki3ae20562017-01-16 20:41:20 +090012178Return True if the string is a valid Python identifier, False otherwise.
12179
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012180Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012181such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012182[clinic start generated code]*/
12183
12184static PyObject *
12185unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012186/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012187{
12188 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12189}
12190
INADA Naoki3ae20562017-01-16 20:41:20 +090012191/*[clinic input]
12192str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012193
INADA Naoki3ae20562017-01-16 20:41:20 +090012194Return True if the string is printable, False otherwise.
12195
12196A string is printable if all of its characters are considered printable in
12197repr() or if it is empty.
12198[clinic start generated code]*/
12199
12200static PyObject *
12201unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012202/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 Py_ssize_t i, length;
12205 int kind;
12206 void *data;
12207
12208 if (PyUnicode_READY(self) == -1)
12209 return NULL;
12210 length = PyUnicode_GET_LENGTH(self);
12211 kind = PyUnicode_KIND(self);
12212 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012213
12214 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 if (length == 1)
12216 return PyBool_FromLong(
12217 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 for (i = 0; i < length; i++) {
12220 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012221 Py_RETURN_FALSE;
12222 }
12223 }
12224 Py_RETURN_TRUE;
12225}
12226
INADA Naoki3ae20562017-01-16 20:41:20 +090012227/*[clinic input]
12228str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229
INADA Naoki3ae20562017-01-16 20:41:20 +090012230 iterable: object
12231 /
12232
12233Concatenate any number of strings.
12234
Martin Panter91a88662017-01-24 00:30:06 +000012235The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012236The result is returned as a new string.
12237
12238Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12239[clinic start generated code]*/
12240
12241static PyObject *
12242unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012243/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244{
INADA Naoki3ae20562017-01-16 20:41:20 +090012245 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246}
12247
Martin v. Löwis18e16552006-02-15 17:27:45 +000012248static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012249unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251 if (PyUnicode_READY(self) == -1)
12252 return -1;
12253 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254}
12255
INADA Naoki3ae20562017-01-16 20:41:20 +090012256/*[clinic input]
12257str.ljust as unicode_ljust
12258
12259 width: Py_ssize_t
12260 fillchar: Py_UCS4 = ' '
12261 /
12262
12263Return a left-justified string of length width.
12264
12265Padding is done using the specified fill character (default is a space).
12266[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267
12268static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012269unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12270/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012272 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274
Victor Stinnerc4b49542011-12-11 22:44:26 +010012275 if (PyUnicode_GET_LENGTH(self) >= width)
12276 return unicode_result_unchanged(self);
12277
12278 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279}
12280
INADA Naoki3ae20562017-01-16 20:41:20 +090012281/*[clinic input]
12282str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283
INADA Naoki3ae20562017-01-16 20:41:20 +090012284Return a copy of the string converted to lowercase.
12285[clinic start generated code]*/
12286
12287static PyObject *
12288unicode_lower_impl(PyObject *self)
12289/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012291 if (PyUnicode_READY(self) == -1)
12292 return NULL;
12293 if (PyUnicode_IS_ASCII(self))
12294 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012295 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296}
12297
/* Strip modes.  Each value doubles as the index of the matching entry in
   stripfuncnames below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the strip modes above.  Both the strings and the
   table itself are const, so the whole table is read-only. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method implementing strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012306
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307/* externally visible for str.strip(unicode) */
12308PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012309_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012310{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 void *data;
12312 int kind;
12313 Py_ssize_t i, j, len;
12314 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012315 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12318 return NULL;
12319
12320 kind = PyUnicode_KIND(self);
12321 data = PyUnicode_DATA(self);
12322 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012323 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12325 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012326 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012327
Benjamin Peterson14339b62009-01-31 16:36:08 +000012328 i = 0;
12329 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012330 while (i < len) {
12331 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12332 if (!BLOOM(sepmask, ch))
12333 break;
12334 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12335 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 i++;
12337 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012338 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012339
Benjamin Peterson14339b62009-01-31 16:36:08 +000012340 j = len;
12341 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012342 j--;
12343 while (j >= i) {
12344 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12345 if (!BLOOM(sepmask, ch))
12346 break;
12347 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12348 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012349 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012350 }
12351
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012353 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012354
Victor Stinner7931d9a2011-11-04 00:22:48 +010012355 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356}
12357
12358PyObject*
12359PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12360{
12361 unsigned char *data;
12362 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012363 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364
Victor Stinnerde636f32011-10-01 03:55:54 +020012365 if (PyUnicode_READY(self) == -1)
12366 return NULL;
12367
Victor Stinner684d5fd2012-05-03 02:32:34 +020012368 length = PyUnicode_GET_LENGTH(self);
12369 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012370
Victor Stinner684d5fd2012-05-03 02:32:34 +020012371 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012372 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373
Victor Stinnerde636f32011-10-01 03:55:54 +020012374 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012375 PyErr_SetString(PyExc_IndexError, "string index out of range");
12376 return NULL;
12377 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012378 if (start >= length || end < start)
12379 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012380
Victor Stinner684d5fd2012-05-03 02:32:34 +020012381 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012382 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012383 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012384 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012385 }
12386 else {
12387 kind = PyUnicode_KIND(self);
12388 data = PyUnicode_1BYTE_DATA(self);
12389 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012390 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012391 length);
12392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394
12395static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012396do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012398 Py_ssize_t len, i, j;
12399
12400 if (PyUnicode_READY(self) == -1)
12401 return NULL;
12402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012404
Victor Stinnercc7af722013-04-09 22:39:24 +020012405 if (PyUnicode_IS_ASCII(self)) {
12406 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12407
12408 i = 0;
12409 if (striptype != RIGHTSTRIP) {
12410 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012411 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012412 if (!_Py_ascii_whitespace[ch])
12413 break;
12414 i++;
12415 }
12416 }
12417
12418 j = len;
12419 if (striptype != LEFTSTRIP) {
12420 j--;
12421 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012422 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012423 if (!_Py_ascii_whitespace[ch])
12424 break;
12425 j--;
12426 }
12427 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012428 }
12429 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012430 else {
12431 int kind = PyUnicode_KIND(self);
12432 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012433
Victor Stinnercc7af722013-04-09 22:39:24 +020012434 i = 0;
12435 if (striptype != RIGHTSTRIP) {
12436 while (i < len) {
12437 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12438 if (!Py_UNICODE_ISSPACE(ch))
12439 break;
12440 i++;
12441 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012442 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012443
12444 j = len;
12445 if (striptype != LEFTSTRIP) {
12446 j--;
12447 while (j >= i) {
12448 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12449 if (!Py_UNICODE_ISSPACE(ch))
12450 break;
12451 j--;
12452 }
12453 j++;
12454 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012455 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012456
Victor Stinner7931d9a2011-11-04 00:22:48 +010012457 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458}
12459
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012460
12461static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012462do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012463{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 if (sep != NULL && sep != Py_None) {
12465 if (PyUnicode_Check(sep))
12466 return _PyUnicode_XStrip(self, striptype, sep);
12467 else {
12468 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012469 "%s arg must be None or str",
12470 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012471 return NULL;
12472 }
12473 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012474
Benjamin Peterson14339b62009-01-31 16:36:08 +000012475 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012476}
12477
12478
INADA Naoki3ae20562017-01-16 20:41:20 +090012479/*[clinic input]
12480str.strip as unicode_strip
12481
12482 chars: object = None
12483 /
12484
Victor Stinner0c4a8282017-01-17 02:21:47 +010012485Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012486
12487If chars is given and not None, remove characters in chars instead.
12488[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012489
12490static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012491unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012492/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012493{
INADA Naoki3ae20562017-01-16 20:41:20 +090012494 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012495}
12496
12497
INADA Naoki3ae20562017-01-16 20:41:20 +090012498/*[clinic input]
12499str.lstrip as unicode_lstrip
12500
12501 chars: object = NULL
12502 /
12503
12504Return a copy of the string with leading whitespace removed.
12505
12506If chars is given and not None, remove characters in chars instead.
12507[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012508
12509static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012510unicode_lstrip_impl(PyObject *self, PyObject *chars)
12511/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012512{
INADA Naoki3ae20562017-01-16 20:41:20 +090012513 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012514}
12515
12516
INADA Naoki3ae20562017-01-16 20:41:20 +090012517/*[clinic input]
12518str.rstrip as unicode_rstrip
12519
12520 chars: object = NULL
12521 /
12522
12523Return a copy of the string with trailing whitespace removed.
12524
12525If chars is given and not None, remove characters in chars instead.
12526[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012527
12528static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012529unicode_rstrip_impl(PyObject *self, PyObject *chars)
12530/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012531{
INADA Naoki3ae20562017-01-16 20:41:20 +090012532 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012533}
12534
12535
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012537unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012539 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541
Serhiy Storchaka05997252013-01-26 12:14:02 +020012542 if (len < 1)
12543 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544
Victor Stinnerc4b49542011-12-11 22:44:26 +010012545 /* no repeat, return original string */
12546 if (len == 1)
12547 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012548
Benjamin Petersonbac79492012-01-14 13:34:47 -050012549 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 return NULL;
12551
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012552 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012553 PyErr_SetString(PyExc_OverflowError,
12554 "repeated string is too long");
12555 return NULL;
12556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012558
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012559 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560 if (!u)
12561 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012562 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 if (PyUnicode_GET_LENGTH(str) == 1) {
12565 const int kind = PyUnicode_KIND(str);
12566 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012567 if (kind == PyUnicode_1BYTE_KIND) {
12568 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012569 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012570 }
12571 else if (kind == PyUnicode_2BYTE_KIND) {
12572 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012573 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012574 ucs2[n] = fill_char;
12575 } else {
12576 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12577 assert(kind == PyUnicode_4BYTE_KIND);
12578 for (n = 0; n < len; ++n)
12579 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012580 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 }
12582 else {
12583 /* number of characters copied this far */
12584 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012585 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012587 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012589 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012591 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012592 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594 }
12595
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012596 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012597 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598}
12599
Alexander Belopolsky40018472011-02-26 01:02:56 +000012600PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012601PyUnicode_Replace(PyObject *str,
12602 PyObject *substr,
12603 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012604 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012606 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12607 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012608 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012609 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610}
12611
INADA Naoki3ae20562017-01-16 20:41:20 +090012612/*[clinic input]
12613str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614
INADA Naoki3ae20562017-01-16 20:41:20 +090012615 old: unicode
12616 new: unicode
12617 count: Py_ssize_t = -1
12618 Maximum number of occurrences to replace.
12619 -1 (the default value) means replace all occurrences.
12620 /
12621
12622Return a copy with all occurrences of substring old replaced by new.
12623
12624If the optional argument count is given, only the first count occurrences are
12625replaced.
12626[clinic start generated code]*/
12627
12628static PyObject *
12629unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12630 Py_ssize_t count)
12631/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012633 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012634 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012635 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636}
12637
Alexander Belopolsky40018472011-02-26 01:02:56 +000012638static PyObject *
12639unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012641 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 Py_ssize_t isize;
12643 Py_ssize_t osize, squote, dquote, i, o;
12644 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012645 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012649 return NULL;
12650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 isize = PyUnicode_GET_LENGTH(unicode);
12652 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 /* Compute length of output, quote characters, and
12655 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012656 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 max = 127;
12658 squote = dquote = 0;
12659 ikind = PyUnicode_KIND(unicode);
12660 for (i = 0; i < isize; i++) {
12661 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012662 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012664 case '\'': squote++; break;
12665 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012667 incr = 2;
12668 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 default:
12670 /* Fast-path ASCII */
12671 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012672 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012674 ;
12675 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012678 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012680 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012682 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012684 if (osize > PY_SSIZE_T_MAX - incr) {
12685 PyErr_SetString(PyExc_OverflowError,
12686 "string is too long to generate repr");
12687 return NULL;
12688 }
12689 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 }
12691
12692 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012693 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012695 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 if (dquote)
12697 /* Both squote and dquote present. Use squote,
12698 and escape them */
12699 osize += squote;
12700 else
12701 quote = '"';
12702 }
Victor Stinner55c08782013-04-14 18:45:39 +020012703 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704
12705 repr = PyUnicode_New(osize, max);
12706 if (repr == NULL)
12707 return NULL;
12708 okind = PyUnicode_KIND(repr);
12709 odata = PyUnicode_DATA(repr);
12710
12711 PyUnicode_WRITE(okind, odata, 0, quote);
12712 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012713 if (unchanged) {
12714 _PyUnicode_FastCopyCharacters(repr, 1,
12715 unicode, 0,
12716 isize);
12717 }
12718 else {
12719 for (i = 0, o = 1; i < isize; i++) {
12720 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721
Victor Stinner55c08782013-04-14 18:45:39 +020012722 /* Escape quotes and backslashes */
12723 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012724 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012726 continue;
12727 }
12728
12729 /* Map special whitespace to '\t', \n', '\r' */
12730 if (ch == '\t') {
12731 PyUnicode_WRITE(okind, odata, o++, '\\');
12732 PyUnicode_WRITE(okind, odata, o++, 't');
12733 }
12734 else if (ch == '\n') {
12735 PyUnicode_WRITE(okind, odata, o++, '\\');
12736 PyUnicode_WRITE(okind, odata, o++, 'n');
12737 }
12738 else if (ch == '\r') {
12739 PyUnicode_WRITE(okind, odata, o++, '\\');
12740 PyUnicode_WRITE(okind, odata, o++, 'r');
12741 }
12742
12743 /* Map non-printable US ASCII to '\xhh' */
12744 else if (ch < ' ' || ch == 0x7F) {
12745 PyUnicode_WRITE(okind, odata, o++, '\\');
12746 PyUnicode_WRITE(okind, odata, o++, 'x');
12747 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12748 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12749 }
12750
12751 /* Copy ASCII characters as-is */
12752 else if (ch < 0x7F) {
12753 PyUnicode_WRITE(okind, odata, o++, ch);
12754 }
12755
12756 /* Non-ASCII characters */
12757 else {
12758 /* Map Unicode whitespace and control characters
12759 (categories Z* and C* except ASCII space)
12760 */
12761 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12762 PyUnicode_WRITE(okind, odata, o++, '\\');
12763 /* Map 8-bit characters to '\xhh' */
12764 if (ch <= 0xff) {
12765 PyUnicode_WRITE(okind, odata, o++, 'x');
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12768 }
12769 /* Map 16-bit characters to '\uxxxx' */
12770 else if (ch <= 0xffff) {
12771 PyUnicode_WRITE(okind, odata, o++, 'u');
12772 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12775 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12776 }
12777 /* Map 21-bit characters to '\U00xxxxxx' */
12778 else {
12779 PyUnicode_WRITE(okind, odata, o++, 'U');
12780 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12781 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12782 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12783 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12784 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12785 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12786 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12787 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12788 }
12789 }
12790 /* Copy characters as-is */
12791 else {
12792 PyUnicode_WRITE(okind, odata, o++, ch);
12793 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012794 }
12795 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012798 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012799 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800}
12801
/* Docstring text for the rfind method, defined with PyDoc_STRVAR as
   rfind__doc__.  It is a runtime string: the line continuations below are
   part of the literal, so continuation lines must not be indented. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012802PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804\n\
12805Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012806such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807arguments start and end are interpreted as in slice notation.\n\
12808\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012809Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810
12811static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012814 /* initialize variables to prevent gcc warning */
12815 PyObject *substring = NULL;
12816 Py_ssize_t start = 0;
12817 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012820 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012821 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012823 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012826 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 if (result == -2)
12829 return NULL;
12830
Christian Heimes217cfd12007-12-02 14:31:20 +000012831 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832}
12833
/* Docstring text for the rindex method, defined with PyDoc_STRVAR as
   rindex__doc__.  It is a runtime string: the line continuations below are
   part of the literal, so continuation lines must not be indented. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012834PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012835 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012837Return the highest index in S where substring sub is found,\n\
12838such that sub is contained within S[start:end]. Optional\n\
12839arguments start and end are interpreted as in slice notation.\n\
12840\n\
12841Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842
12843static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012846 /* initialize variables to prevent gcc warning */
12847 PyObject *substring = NULL;
12848 Py_ssize_t start = 0;
12849 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012850 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012852 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012855 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012857
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012858 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012860 if (result == -2)
12861 return NULL;
12862
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863 if (result < 0) {
12864 PyErr_SetString(PyExc_ValueError, "substring not found");
12865 return NULL;
12866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867
Christian Heimes217cfd12007-12-02 14:31:20 +000012868 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869}
12870
INADA Naoki3ae20562017-01-16 20:41:20 +090012871/*[clinic input]
12872str.rjust as unicode_rjust
12873
12874 width: Py_ssize_t
12875 fillchar: Py_UCS4 = ' '
12876 /
12877
12878Return a right-justified string of length width.
12879
12880Padding is done using the specified fill character (default is a space).
12881[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882
12883static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012884unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12885/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012887 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888 return NULL;
12889
Victor Stinnerc4b49542011-12-11 22:44:26 +010012890 if (PyUnicode_GET_LENGTH(self) >= width)
12891 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892
Victor Stinnerc4b49542011-12-11 22:44:26 +010012893 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894}
12895
Alexander Belopolsky40018472011-02-26 01:02:56 +000012896PyObject *
12897PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012899 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012902 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903}
12904
INADA Naoki3ae20562017-01-16 20:41:20 +090012905/*[clinic input]
12906str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012907
INADA Naoki3ae20562017-01-16 20:41:20 +090012908 sep: object = None
12909 The delimiter according to which to split the string.
12910 None (the default value) means split according to any whitespace,
12911 and discard empty strings from the result.
12912 maxsplit: Py_ssize_t = -1
12913 Maximum number of splits to do.
12914 -1 (the default value) means no limit.
12915
12916Return a list of the words in the string, using sep as the delimiter string.
12917[clinic start generated code]*/
12918
12919static PyObject *
12920unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12921/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012922{
INADA Naoki3ae20562017-01-16 20:41:20 +090012923 if (sep == Py_None)
12924 return split(self, NULL, maxsplit);
12925 if (PyUnicode_Check(sep))
12926 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012927
Victor Stinner998b8062018-09-12 00:23:25 +020012928 PyErr_Format(PyExc_TypeError,
12929 "must be str or None, not %.100s",
12930 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012931 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932}
12933
Thomas Wouters477c8d52006-05-27 19:21:47 +000012934PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012935PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012936{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012937 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012938 int kind1, kind2;
12939 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012940 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012941
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012942 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012943 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012944
Victor Stinner14f8f022011-10-05 20:58:25 +020012945 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012947 len1 = PyUnicode_GET_LENGTH(str_obj);
12948 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012949 if (kind1 < kind2 || len1 < len2) {
12950 _Py_INCREF_UNICODE_EMPTY();
12951 if (!unicode_empty)
12952 out = NULL;
12953 else {
12954 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12955 Py_DECREF(unicode_empty);
12956 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012957 return out;
12958 }
12959 buf1 = PyUnicode_DATA(str_obj);
12960 buf2 = PyUnicode_DATA(sep_obj);
12961 if (kind2 != kind1) {
12962 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12963 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012964 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012967 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012969 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12970 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12971 else
12972 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 break;
12974 case PyUnicode_2BYTE_KIND:
12975 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12976 break;
12977 case PyUnicode_4BYTE_KIND:
12978 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12979 break;
12980 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012981 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012983
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012984 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012986
12987 return out;
12988}
12989
12990
12991PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012992PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012993{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012994 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012995 int kind1, kind2;
12996 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012998
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012999 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013000 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013001
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013002 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 len1 = PyUnicode_GET_LENGTH(str_obj);
13005 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013006 if (kind1 < kind2 || len1 < len2) {
13007 _Py_INCREF_UNICODE_EMPTY();
13008 if (!unicode_empty)
13009 out = NULL;
13010 else {
13011 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13012 Py_DECREF(unicode_empty);
13013 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013014 return out;
13015 }
13016 buf1 = PyUnicode_DATA(str_obj);
13017 buf2 = PyUnicode_DATA(sep_obj);
13018 if (kind2 != kind1) {
13019 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13020 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013021 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013024 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013026 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13027 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13028 else
13029 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 break;
13031 case PyUnicode_2BYTE_KIND:
13032 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13033 break;
13034 case PyUnicode_4BYTE_KIND:
13035 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13036 break;
13037 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013038 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013040
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013041 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013042 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013043
13044 return out;
13045}
13046
INADA Naoki3ae20562017-01-16 20:41:20 +090013047/*[clinic input]
13048str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013049
INADA Naoki3ae20562017-01-16 20:41:20 +090013050 sep: object
13051 /
13052
13053Partition the string into three parts using the given separator.
13054
13055This will search for the separator in the string. If the separator is found,
13056returns a 3-tuple containing the part before the separator, the separator
13057itself, and the part after it.
13058
13059If the separator is not found, returns a 3-tuple containing the original string
13060and two empty strings.
13061[clinic start generated code]*/
13062
13063static PyObject *
13064unicode_partition(PyObject *self, PyObject *sep)
13065/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013066{
INADA Naoki3ae20562017-01-16 20:41:20 +090013067 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013068}
13069
INADA Naoki3ae20562017-01-16 20:41:20 +090013070/*[clinic input]
13071str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013072
INADA Naoki3ae20562017-01-16 20:41:20 +090013073Partition the string into three parts using the given separator.
13074
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013075This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013076the separator is found, returns a 3-tuple containing the part before the
13077separator, the separator itself, and the part after it.
13078
13079If the separator is not found, returns a 3-tuple containing two empty strings
13080and the original string.
13081[clinic start generated code]*/
13082
13083static PyObject *
13084unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013085/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013086{
INADA Naoki3ae20562017-01-16 20:41:20 +090013087 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013088}
13089
Alexander Belopolsky40018472011-02-26 01:02:56 +000013090PyObject *
13091PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013092{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013093 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013094 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013095
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013096 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013097}
13098
INADA Naoki3ae20562017-01-16 20:41:20 +090013099/*[clinic input]
13100str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013101
INADA Naoki3ae20562017-01-16 20:41:20 +090013102Return a list of the words in the string, using sep as the delimiter string.
13103
13104Splits are done starting at the end of the string and working to the front.
13105[clinic start generated code]*/
13106
13107static PyObject *
13108unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13109/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013110{
INADA Naoki3ae20562017-01-16 20:41:20 +090013111 if (sep == Py_None)
13112 return rsplit(self, NULL, maxsplit);
13113 if (PyUnicode_Check(sep))
13114 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013115
Victor Stinner998b8062018-09-12 00:23:25 +020013116 PyErr_Format(PyExc_TypeError,
13117 "must be str or None, not %.100s",
13118 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013119 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013120}
13121
INADA Naoki3ae20562017-01-16 20:41:20 +090013122/*[clinic input]
13123str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013125 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013126
13127Return a list of the lines in the string, breaking at line boundaries.
13128
13129Line breaks are not included in the resulting list unless keepends is given and
13130true.
13131[clinic start generated code]*/
13132
13133static PyObject *
13134unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013135/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013137 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138}
13139
13140static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013141PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013143 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144}
13145
INADA Naoki3ae20562017-01-16 20:41:20 +090013146/*[clinic input]
13147str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013148
INADA Naoki3ae20562017-01-16 20:41:20 +090013149Convert uppercase characters to lowercase and lowercase characters to uppercase.
13150[clinic start generated code]*/
13151
13152static PyObject *
13153unicode_swapcase_impl(PyObject *self)
13154/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013156 if (PyUnicode_READY(self) == -1)
13157 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013158 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159}
13160
Larry Hastings61272b72014-01-07 12:41:53 -080013161/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013162
Larry Hastings31826802013-10-19 00:09:25 -070013163@staticmethod
13164str.maketrans as unicode_maketrans
13165
13166 x: object
13167
13168 y: unicode=NULL
13169
13170 z: unicode=NULL
13171
13172 /
13173
13174Return a translation table usable for str.translate().
13175
13176If there is only one argument, it must be a dictionary mapping Unicode
13177ordinals (integers) or characters to Unicode ordinals, strings or None.
13178Character keys will be then converted to ordinals.
13179If there are two arguments, they must be strings of equal length, and
13180in the resulting dictionary, each character in x will be mapped to the
13181character at the same position in y. If there is a third argument, it
13182must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013183[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013184
Larry Hastings31826802013-10-19 00:09:25 -070013185static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013186unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013187/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013188{
Georg Brandlceee0772007-11-27 23:48:05 +000013189 PyObject *new = NULL, *key, *value;
13190 Py_ssize_t i = 0;
13191 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013192
Georg Brandlceee0772007-11-27 23:48:05 +000013193 new = PyDict_New();
13194 if (!new)
13195 return NULL;
13196 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013197 int x_kind, y_kind, z_kind;
13198 void *x_data, *y_data, *z_data;
13199
Georg Brandlceee0772007-11-27 23:48:05 +000013200 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013201 if (!PyUnicode_Check(x)) {
13202 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13203 "be a string if there is a second argument");
13204 goto err;
13205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013206 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013207 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13208 "arguments must have equal length");
13209 goto err;
13210 }
13211 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013212 x_kind = PyUnicode_KIND(x);
13213 y_kind = PyUnicode_KIND(y);
13214 x_data = PyUnicode_DATA(x);
13215 y_data = PyUnicode_DATA(y);
13216 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13217 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013218 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013219 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013220 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013221 if (!value) {
13222 Py_DECREF(key);
13223 goto err;
13224 }
Georg Brandlceee0772007-11-27 23:48:05 +000013225 res = PyDict_SetItem(new, key, value);
13226 Py_DECREF(key);
13227 Py_DECREF(value);
13228 if (res < 0)
13229 goto err;
13230 }
13231 /* create entries for deleting chars in z */
13232 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233 z_kind = PyUnicode_KIND(z);
13234 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013235 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013237 if (!key)
13238 goto err;
13239 res = PyDict_SetItem(new, key, Py_None);
13240 Py_DECREF(key);
13241 if (res < 0)
13242 goto err;
13243 }
13244 }
13245 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013246 int kind;
13247 void *data;
13248
Georg Brandlceee0772007-11-27 23:48:05 +000013249 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013250 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013251 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13252 "to maketrans it must be a dict");
13253 goto err;
13254 }
13255 /* copy entries into the new dict, converting string keys to int keys */
13256 while (PyDict_Next(x, &i, &key, &value)) {
13257 if (PyUnicode_Check(key)) {
13258 /* convert string keys to integer keys */
13259 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013260 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013261 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13262 "table must be of length 1");
13263 goto err;
13264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013265 kind = PyUnicode_KIND(key);
13266 data = PyUnicode_DATA(key);
13267 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013268 if (!newkey)
13269 goto err;
13270 res = PyDict_SetItem(new, newkey, value);
13271 Py_DECREF(newkey);
13272 if (res < 0)
13273 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013274 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013275 /* just keep integer keys */
13276 if (PyDict_SetItem(new, key, value) < 0)
13277 goto err;
13278 } else {
13279 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13280 "be strings or integers");
13281 goto err;
13282 }
13283 }
13284 }
13285 return new;
13286 err:
13287 Py_DECREF(new);
13288 return NULL;
13289}
13290
INADA Naoki3ae20562017-01-16 20:41:20 +090013291/*[clinic input]
13292str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013293
INADA Naoki3ae20562017-01-16 20:41:20 +090013294 table: object
13295 Translation table, which must be a mapping of Unicode ordinals to
13296 Unicode ordinals, strings, or None.
13297 /
13298
13299Replace each character in the string using the given translation table.
13300
13301The table must implement lookup/indexing via __getitem__, for instance a
13302dictionary or list. If this operation raises LookupError, the character is
13303left untouched. Characters mapped to None are deleted.
13304[clinic start generated code]*/
13305
13306static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013307unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013308/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013310 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311}
13312
INADA Naoki3ae20562017-01-16 20:41:20 +090013313/*[clinic input]
13314str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013315
INADA Naoki3ae20562017-01-16 20:41:20 +090013316Return a copy of the string converted to uppercase.
13317[clinic start generated code]*/
13318
13319static PyObject *
13320unicode_upper_impl(PyObject *self)
13321/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013322{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013323 if (PyUnicode_READY(self) == -1)
13324 return NULL;
13325 if (PyUnicode_IS_ASCII(self))
13326 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013327 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013328}
13329
INADA Naoki3ae20562017-01-16 20:41:20 +090013330/*[clinic input]
13331str.zfill as unicode_zfill
13332
13333 width: Py_ssize_t
13334 /
13335
13336Pad a numeric string with zeros on the left, to fill a field of the given width.
13337
13338The string is never truncated.
13339[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013340
13341static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013342unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013343/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013344{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013345 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013346 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013347 int kind;
13348 void *data;
13349 Py_UCS4 chr;
13350
Benjamin Petersonbac79492012-01-14 13:34:47 -050013351 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013352 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353
Victor Stinnerc4b49542011-12-11 22:44:26 +010013354 if (PyUnicode_GET_LENGTH(self) >= width)
13355 return unicode_result_unchanged(self);
13356
13357 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013358
13359 u = pad(self, fill, 0, '0');
13360
Walter Dörwald068325e2002-04-15 13:36:47 +000013361 if (u == NULL)
13362 return NULL;
13363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013364 kind = PyUnicode_KIND(u);
13365 data = PyUnicode_DATA(u);
13366 chr = PyUnicode_READ(kind, data, fill);
13367
13368 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013369 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013370 PyUnicode_WRITE(kind, data, 0, chr);
13371 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372 }
13373
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013374 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013375 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013376}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377
#if 0
/* Compiled out by the surrounding #if 0. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13385
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013386PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013387 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013388\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013389Return True if S starts with the specified prefix, False otherwise.\n\
13390With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013391With optional end, stop comparing S at that position.\n\
13392prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013393
13394static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013395unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013396 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013397{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013398 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013399 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013400 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013401 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013402 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013403
Jesus Ceaac451502011-04-20 17:09:23 +020013404 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013405 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013406 if (PyTuple_Check(subobj)) {
13407 Py_ssize_t i;
13408 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013409 substring = PyTuple_GET_ITEM(subobj, i);
13410 if (!PyUnicode_Check(substring)) {
13411 PyErr_Format(PyExc_TypeError,
13412 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013413 "not %.100s",
13414 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013415 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013416 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013417 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013418 if (result == -1)
13419 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013420 if (result) {
13421 Py_RETURN_TRUE;
13422 }
13423 }
13424 /* nothing matched */
13425 Py_RETURN_FALSE;
13426 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013427 if (!PyUnicode_Check(subobj)) {
13428 PyErr_Format(PyExc_TypeError,
13429 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013430 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013431 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013432 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013433 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013434 if (result == -1)
13435 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013436 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437}
13438
13439
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013440PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013441 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013442\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013443Return True if S ends with the specified suffix, False otherwise.\n\
13444With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013445With optional end, stop comparing S at that position.\n\
13446suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013447
13448static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013449unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013450 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013451{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013452 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013453 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013454 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013455 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013456 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013457
Jesus Ceaac451502011-04-20 17:09:23 +020013458 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013459 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013460 if (PyTuple_Check(subobj)) {
13461 Py_ssize_t i;
13462 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013463 substring = PyTuple_GET_ITEM(subobj, i);
13464 if (!PyUnicode_Check(substring)) {
13465 PyErr_Format(PyExc_TypeError,
13466 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013467 "not %.100s",
13468 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013469 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013470 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013471 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013472 if (result == -1)
13473 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013474 if (result) {
13475 Py_RETURN_TRUE;
13476 }
13477 }
13478 Py_RETURN_FALSE;
13479 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013480 if (!PyUnicode_Check(subobj)) {
13481 PyErr_Format(PyExc_TypeError,
13482 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013483 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013484 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013485 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013486 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013487 if (result == -1)
13488 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013489 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013490}
13491
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013492static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013493_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013494{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013495 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13496 writer->data = PyUnicode_DATA(writer->buffer);
13497
13498 if (!writer->readonly) {
13499 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013500 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013501 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013502 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013503 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13504 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13505 writer->kind = PyUnicode_WCHAR_KIND;
13506 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13507
Victor Stinner8f674cc2013-04-17 23:02:17 +020013508 /* Copy-on-write mode: set buffer size to 0 so
13509 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13510 * next write. */
13511 writer->size = 0;
13512 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013513}
13514
Victor Stinnerd3f08822012-05-29 12:57:52 +020013515void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013516_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013517{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013518 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013519
13520 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013521 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013522
13523 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13524 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13525 writer->kind = PyUnicode_WCHAR_KIND;
13526 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013527}
13528
Victor Stinnerd3f08822012-05-29 12:57:52 +020013529int
13530_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13531 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013532{
13533 Py_ssize_t newlen;
13534 PyObject *newbuffer;
13535
Victor Stinner2740e462016-09-06 16:58:36 -070013536 assert(maxchar <= MAX_UNICODE);
13537
Victor Stinnerca9381e2015-09-22 00:58:32 +020013538 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013539 assert((maxchar > writer->maxchar && length >= 0)
13540 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013541
Victor Stinner202fdca2012-05-07 12:47:02 +020013542 if (length > PY_SSIZE_T_MAX - writer->pos) {
13543 PyErr_NoMemory();
13544 return -1;
13545 }
13546 newlen = writer->pos + length;
13547
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013548 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013549
Victor Stinnerd3f08822012-05-29 12:57:52 +020013550 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013551 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013552 if (writer->overallocate
13553 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13554 /* overallocate to limit the number of realloc() */
13555 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013556 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013557 if (newlen < writer->min_length)
13558 newlen = writer->min_length;
13559
Victor Stinnerd3f08822012-05-29 12:57:52 +020013560 writer->buffer = PyUnicode_New(newlen, maxchar);
13561 if (writer->buffer == NULL)
13562 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013563 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013564 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013565 if (writer->overallocate
13566 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13567 /* overallocate to limit the number of realloc() */
13568 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013569 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013570 if (newlen < writer->min_length)
13571 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013572
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013573 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013574 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013575 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013576 newbuffer = PyUnicode_New(newlen, maxchar);
13577 if (newbuffer == NULL)
13578 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013579 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13580 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013581 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013582 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013583 }
13584 else {
13585 newbuffer = resize_compact(writer->buffer, newlen);
13586 if (newbuffer == NULL)
13587 return -1;
13588 }
13589 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013590 }
13591 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013592 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013593 newbuffer = PyUnicode_New(writer->size, maxchar);
13594 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013595 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013596 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13597 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013598 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013599 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013600 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013601 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013602
13603#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013604}
13605
Victor Stinnerca9381e2015-09-22 00:58:32 +020013606int
13607_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13608 enum PyUnicode_Kind kind)
13609{
13610 Py_UCS4 maxchar;
13611
13612 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13613 assert(writer->kind < kind);
13614
13615 switch (kind)
13616 {
13617 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13618 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13619 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13620 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013621 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013622 }
13623
13624 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13625}
13626
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013627static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013628_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013629{
Victor Stinner2740e462016-09-06 16:58:36 -070013630 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013631 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13632 return -1;
13633 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13634 writer->pos++;
13635 return 0;
13636}
13637
13638int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013639_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13640{
13641 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13642}
13643
13644int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013645_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13646{
13647 Py_UCS4 maxchar;
13648 Py_ssize_t len;
13649
13650 if (PyUnicode_READY(str) == -1)
13651 return -1;
13652 len = PyUnicode_GET_LENGTH(str);
13653 if (len == 0)
13654 return 0;
13655 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13656 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013657 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013658 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013659 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013660 Py_INCREF(str);
13661 writer->buffer = str;
13662 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013663 writer->pos += len;
13664 return 0;
13665 }
13666 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13667 return -1;
13668 }
13669 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13670 str, 0, len);
13671 writer->pos += len;
13672 return 0;
13673}
13674
Victor Stinnere215d962012-10-06 23:03:36 +020013675int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013676_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13677 Py_ssize_t start, Py_ssize_t end)
13678{
13679 Py_UCS4 maxchar;
13680 Py_ssize_t len;
13681
13682 if (PyUnicode_READY(str) == -1)
13683 return -1;
13684
13685 assert(0 <= start);
13686 assert(end <= PyUnicode_GET_LENGTH(str));
13687 assert(start <= end);
13688
13689 if (end == 0)
13690 return 0;
13691
13692 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13693 return _PyUnicodeWriter_WriteStr(writer, str);
13694
13695 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13696 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13697 else
13698 maxchar = writer->maxchar;
13699 len = end - start;
13700
13701 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13702 return -1;
13703
13704 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13705 str, start, len);
13706 writer->pos += len;
13707 return 0;
13708}
13709
13710int
Victor Stinner4a587072013-11-19 12:54:53 +010013711_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13712 const char *ascii, Py_ssize_t len)
13713{
13714 if (len == -1)
13715 len = strlen(ascii);
13716
13717 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13718
13719 if (writer->buffer == NULL && !writer->overallocate) {
13720 PyObject *str;
13721
13722 str = _PyUnicode_FromASCII(ascii, len);
13723 if (str == NULL)
13724 return -1;
13725
13726 writer->readonly = 1;
13727 writer->buffer = str;
13728 _PyUnicodeWriter_Update(writer);
13729 writer->pos += len;
13730 return 0;
13731 }
13732
13733 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13734 return -1;
13735
13736 switch (writer->kind)
13737 {
13738 case PyUnicode_1BYTE_KIND:
13739 {
13740 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13741 Py_UCS1 *data = writer->data;
13742
Christian Heimesf051e432016-09-13 20:22:02 +020013743 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013744 break;
13745 }
13746 case PyUnicode_2BYTE_KIND:
13747 {
13748 _PyUnicode_CONVERT_BYTES(
13749 Py_UCS1, Py_UCS2,
13750 ascii, ascii + len,
13751 (Py_UCS2 *)writer->data + writer->pos);
13752 break;
13753 }
13754 case PyUnicode_4BYTE_KIND:
13755 {
13756 _PyUnicode_CONVERT_BYTES(
13757 Py_UCS1, Py_UCS4,
13758 ascii, ascii + len,
13759 (Py_UCS4 *)writer->data + writer->pos);
13760 break;
13761 }
13762 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013763 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013764 }
13765
13766 writer->pos += len;
13767 return 0;
13768}
13769
13770int
13771_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13772 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013773{
13774 Py_UCS4 maxchar;
13775
13776 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13777 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13778 return -1;
13779 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13780 writer->pos += len;
13781 return 0;
13782}
13783
Victor Stinnerd3f08822012-05-29 12:57:52 +020013784PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013785_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013786{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013787 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013788
Victor Stinnerd3f08822012-05-29 12:57:52 +020013789 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013790 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013791 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013792 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013793
13794 str = writer->buffer;
13795 writer->buffer = NULL;
13796
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013797 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013798 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13799 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013800 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013801
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013802 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13803 PyObject *str2;
13804 str2 = resize_compact(str, writer->pos);
13805 if (str2 == NULL) {
13806 Py_DECREF(str);
13807 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013808 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013809 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013810 }
13811
Victor Stinner15a0bd32013-07-08 22:29:55 +020013812 assert(_PyUnicode_CheckConsistency(str, 1));
13813 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013814}
13815
Victor Stinnerd3f08822012-05-29 12:57:52 +020013816void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013817_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013818{
13819 Py_CLEAR(writer->buffer);
13820}
13821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013822#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013823
13824PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013825 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013826\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013827Return a formatted version of S, using substitutions from args and kwargs.\n\
13828The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013829
Eric Smith27bbca62010-11-04 17:06:58 +000013830PyDoc_STRVAR(format_map__doc__,
13831 "S.format_map(mapping) -> str\n\
13832\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013833Return a formatted version of S, using substitutions from mapping.\n\
13834The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013835
INADA Naoki3ae20562017-01-16 20:41:20 +090013836/*[clinic input]
13837str.__format__ as unicode___format__
13838
13839 format_spec: unicode
13840 /
13841
13842Return a formatted version of the string as described by format_spec.
13843[clinic start generated code]*/
13844
Eric Smith4a7d76d2008-05-30 18:10:19 +000013845static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013846unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013847/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013848{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013849 _PyUnicodeWriter writer;
13850 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013851
Victor Stinnerd3f08822012-05-29 12:57:52 +020013852 if (PyUnicode_READY(self) == -1)
13853 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013854 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013855 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13856 self, format_spec, 0,
13857 PyUnicode_GET_LENGTH(format_spec));
13858 if (ret == -1) {
13859 _PyUnicodeWriter_Dealloc(&writer);
13860 return NULL;
13861 }
13862 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013863}
13864
INADA Naoki3ae20562017-01-16 20:41:20 +090013865/*[clinic input]
13866str.__sizeof__ as unicode_sizeof
13867
13868Return the size of the string in memory, in bytes.
13869[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013870
13871static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013872unicode_sizeof_impl(PyObject *self)
13873/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013874{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013875 Py_ssize_t size;
13876
13877 /* If it's a compact object, account for base structure +
13878 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013879 if (PyUnicode_IS_COMPACT_ASCII(self))
13880 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13881 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013882 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013883 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013884 else {
13885 /* If it is a two-block object, account for base object, and
13886 for character block if present. */
13887 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013888 if (_PyUnicode_DATA_ANY(self))
13889 size += (PyUnicode_GET_LENGTH(self) + 1) *
13890 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013891 }
13892 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013893 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013894 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13895 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13896 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13897 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013898
13899 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013900}
13901
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013902static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013903unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013904{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013905 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013906 if (!copy)
13907 return NULL;
13908 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013909}
13910
/* Method table for the str type.

   Entries spelled UNICODE_*_METHODDEF are macros defined outside this
   table (NOTE(review): presumably generated by Argument Clinic -- confirm);
   the remaining entries are written out by hand.  The table is terminated
   by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* do_string_format is registered with METH_VARARGS | METH_KEYWORDS, so
       it is cast through void(*)(void) before being stored as a
       PyCFunction. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* Registered without a docstring. */
    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13967
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013968static PyObject *
13969unicode_mod(PyObject *v, PyObject *w)
13970{
Brian Curtindfc80e32011-08-10 20:28:54 -050013971 if (!PyUnicode_Check(v))
13972 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013973 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013974}
13975
13976static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013977 0, /*nb_add*/
13978 0, /*nb_subtract*/
13979 0, /*nb_multiply*/
13980 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013981};
13982
Guido van Rossumd57fd912000-03-10 22:53:23 +000013983static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013984 (lenfunc) unicode_length, /* sq_length */
13985 PyUnicode_Concat, /* sq_concat */
13986 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13987 (ssizeargfunc) unicode_getitem, /* sq_item */
13988 0, /* sq_slice */
13989 0, /* sq_ass_item */
13990 0, /* sq_ass_slice */
13991 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013992};
13993
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013994static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013995unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013997 if (PyUnicode_READY(self) == -1)
13998 return NULL;
13999
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014000 if (PyIndex_Check(item)) {
14001 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014002 if (i == -1 && PyErr_Occurred())
14003 return NULL;
14004 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014005 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014006 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014007 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000014008 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014009 PyObject *result;
14010 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014011 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014012 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014013
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014014 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014015 return NULL;
14016 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014017 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14018 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014019
14020 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014021 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014022 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014023 slicelength == PyUnicode_GET_LENGTH(self)) {
14024 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014025 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014026 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014027 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014028 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014029 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014030 src_kind = PyUnicode_KIND(self);
14031 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014032 if (!PyUnicode_IS_ASCII(self)) {
14033 kind_limit = kind_maxchar_limit(src_kind);
14034 max_char = 0;
14035 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14036 ch = PyUnicode_READ(src_kind, src_data, cur);
14037 if (ch > max_char) {
14038 max_char = ch;
14039 if (max_char >= kind_limit)
14040 break;
14041 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014042 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014043 }
Victor Stinner55c99112011-10-13 01:17:06 +020014044 else
14045 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014046 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014047 if (result == NULL)
14048 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014049 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014050 dest_data = PyUnicode_DATA(result);
14051
14052 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014053 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14054 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014055 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014056 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014057 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014058 } else {
14059 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14060 return NULL;
14061 }
14062}
14063
14064static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014065 (lenfunc)unicode_length, /* mp_length */
14066 (binaryfunc)unicode_subscript, /* mp_subscript */
14067 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014068};
14069
Guido van Rossumd57fd912000-03-10 22:53:23 +000014070
Guido van Rossumd57fd912000-03-10 22:53:23 +000014071/* Helpers for PyUnicode_Format() */
14072
Victor Stinnera47082312012-10-04 02:19:54 +020014073struct unicode_formatter_t {
14074 PyObject *args;
14075 int args_owned;
14076 Py_ssize_t arglen, argidx;
14077 PyObject *dict;
14078
14079 enum PyUnicode_Kind fmtkind;
14080 Py_ssize_t fmtcnt, fmtpos;
14081 void *fmtdata;
14082 PyObject *fmtstr;
14083
14084 _PyUnicodeWriter writer;
14085};
14086
14087struct unicode_format_arg_t {
14088 Py_UCS4 ch;
14089 int flags;
14090 Py_ssize_t width;
14091 int prec;
14092 int sign;
14093};
14094
Guido van Rossumd57fd912000-03-10 22:53:23 +000014095static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014096unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014097{
Victor Stinnera47082312012-10-04 02:19:54 +020014098 Py_ssize_t argidx = ctx->argidx;
14099
14100 if (argidx < ctx->arglen) {
14101 ctx->argidx++;
14102 if (ctx->arglen < 0)
14103 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014104 else
Victor Stinnera47082312012-10-04 02:19:54 +020014105 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014106 }
14107 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014108 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014109 return NULL;
14110}
14111
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014112/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014113
Victor Stinnera47082312012-10-04 02:19:54 +020014114/* Format a float into the writer if the writer is not NULL, or into *p_output
14115 otherwise.
14116
14117 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014118static int
Victor Stinnera47082312012-10-04 02:19:54 +020014119formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14120 PyObject **p_output,
14121 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014122{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014123 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014124 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014125 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014126 int prec;
14127 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014128
Guido van Rossumd57fd912000-03-10 22:53:23 +000014129 x = PyFloat_AsDouble(v);
14130 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014131 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014132
Victor Stinnera47082312012-10-04 02:19:54 +020014133 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014134 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014135 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014136
Victor Stinnera47082312012-10-04 02:19:54 +020014137 if (arg->flags & F_ALT)
14138 dtoa_flags = Py_DTSF_ALT;
14139 else
14140 dtoa_flags = 0;
14141 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014142 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014143 return -1;
14144 len = strlen(p);
14145 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014146 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014147 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014148 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014149 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014150 }
14151 else
14152 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014153 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014154 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014155}
14156
Victor Stinnerd0880d52012-04-27 23:40:13 +020014157/* formatlong() emulates the format codes d, u, o, x and X, and
14158 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14159 * Python's regular ints.
14160 * Return value: a new PyUnicodeObject*, or NULL if error.
14161 * The output string is of the form
14162 * "-"? ("0x" | "0X")? digit+
14163 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14164 * set in flags. The case of hex digits will be correct,
14165 * There will be at least prec digits, zero-filled on the left if
14166 * necessary to get that many.
14167 * val object to be converted
14168 * flags bitmask of format flags; only F_ALT is looked at
14169 * prec minimum number of digits; 0-fill on left if needed
14170 * type a character in [duoxX]; u acts the same as d
14171 *
14172 * CAUTION: o, x and X conversions on regular ints can never
14173 * produce a '-' sign, but can for Python's unbounded ints.
14174 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014175PyObject *
14176_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014177{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014178 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014179 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014180 Py_ssize_t i;
14181 int sign; /* 1 if '-', else 0 */
14182 int len; /* number of characters */
14183 Py_ssize_t llen;
14184 int numdigits; /* len == numnondigits + numdigits */
14185 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014186
Victor Stinnerd0880d52012-04-27 23:40:13 +020014187 /* Avoid exceeding SSIZE_T_MAX */
14188 if (prec > INT_MAX-3) {
14189 PyErr_SetString(PyExc_OverflowError,
14190 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014191 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014192 }
14193
14194 assert(PyLong_Check(val));
14195
14196 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014197 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014198 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014199 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014200 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014201 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014202 /* int and int subclasses should print numerically when a numeric */
14203 /* format code is used (see issue18780) */
14204 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014205 break;
14206 case 'o':
14207 numnondigits = 2;
14208 result = PyNumber_ToBase(val, 8);
14209 break;
14210 case 'x':
14211 case 'X':
14212 numnondigits = 2;
14213 result = PyNumber_ToBase(val, 16);
14214 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014215 }
14216 if (!result)
14217 return NULL;
14218
14219 assert(unicode_modifiable(result));
14220 assert(PyUnicode_IS_READY(result));
14221 assert(PyUnicode_IS_ASCII(result));
14222
14223 /* To modify the string in-place, there can only be one reference. */
14224 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014225 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014226 PyErr_BadInternalCall();
14227 return NULL;
14228 }
14229 buf = PyUnicode_DATA(result);
14230 llen = PyUnicode_GET_LENGTH(result);
14231 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014232 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014233 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014234 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014235 return NULL;
14236 }
14237 len = (int)llen;
14238 sign = buf[0] == '-';
14239 numnondigits += sign;
14240 numdigits = len - numnondigits;
14241 assert(numdigits > 0);
14242
14243 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014244 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014245 (type == 'o' || type == 'x' || type == 'X'))) {
14246 assert(buf[sign] == '0');
14247 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14248 buf[sign+1] == 'o');
14249 numnondigits -= 2;
14250 buf += 2;
14251 len -= 2;
14252 if (sign)
14253 buf[0] = '-';
14254 assert(len == numnondigits + numdigits);
14255 assert(numdigits > 0);
14256 }
14257
14258 /* Fill with leading zeroes to meet minimum width. */
14259 if (prec > numdigits) {
14260 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14261 numnondigits + prec);
14262 char *b1;
14263 if (!r1) {
14264 Py_DECREF(result);
14265 return NULL;
14266 }
14267 b1 = PyBytes_AS_STRING(r1);
14268 for (i = 0; i < numnondigits; ++i)
14269 *b1++ = *buf++;
14270 for (i = 0; i < prec - numdigits; i++)
14271 *b1++ = '0';
14272 for (i = 0; i < numdigits; i++)
14273 *b1++ = *buf++;
14274 *b1 = '\0';
14275 Py_DECREF(result);
14276 result = r1;
14277 buf = PyBytes_AS_STRING(result);
14278 len = numnondigits + prec;
14279 }
14280
14281 /* Fix up case for hex conversions. */
14282 if (type == 'X') {
14283 /* Need to convert all lower case letters to upper case.
14284 and need to convert 0x to 0X (and -0x to -0X). */
14285 for (i = 0; i < len; i++)
14286 if (buf[i] >= 'a' && buf[i] <= 'x')
14287 buf[i] -= 'a'-'A';
14288 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014289 if (!PyUnicode_Check(result)
14290 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014291 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014292 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014293 Py_DECREF(result);
14294 result = unicode;
14295 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014296 else if (len != PyUnicode_GET_LENGTH(result)) {
14297 if (PyUnicode_Resize(&result, len) < 0)
14298 Py_CLEAR(result);
14299 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014300 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014301}
14302
Ethan Furmandf3ed242014-01-05 06:50:30 -080014303/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014304 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014305 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014306 * -1 and raise an exception on error */
14307static int
Victor Stinnera47082312012-10-04 02:19:54 +020014308mainformatlong(PyObject *v,
14309 struct unicode_format_arg_t *arg,
14310 PyObject **p_output,
14311 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014312{
14313 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014314 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014315
14316 if (!PyNumber_Check(v))
14317 goto wrongtype;
14318
Ethan Furman9ab74802014-03-21 06:38:46 -070014319 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014320 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014321 if (type == 'o' || type == 'x' || type == 'X') {
14322 iobj = PyNumber_Index(v);
14323 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014324 if (PyErr_ExceptionMatches(PyExc_TypeError))
14325 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014326 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014327 }
14328 }
14329 else {
14330 iobj = PyNumber_Long(v);
14331 if (iobj == NULL ) {
14332 if (PyErr_ExceptionMatches(PyExc_TypeError))
14333 goto wrongtype;
14334 return -1;
14335 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014336 }
14337 assert(PyLong_Check(iobj));
14338 }
14339 else {
14340 iobj = v;
14341 Py_INCREF(iobj);
14342 }
14343
14344 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014345 && arg->width == -1 && arg->prec == -1
14346 && !(arg->flags & (F_SIGN | F_BLANK))
14347 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014348 {
14349 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014350 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014351 int base;
14352
Victor Stinnera47082312012-10-04 02:19:54 +020014353 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014354 {
14355 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014356 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014357 case 'd':
14358 case 'i':
14359 case 'u':
14360 base = 10;
14361 break;
14362 case 'o':
14363 base = 8;
14364 break;
14365 case 'x':
14366 case 'X':
14367 base = 16;
14368 break;
14369 }
14370
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014371 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14372 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014373 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014374 }
14375 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014376 return 1;
14377 }
14378
Ethan Furmanb95b5612015-01-23 20:05:18 -080014379 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014380 Py_DECREF(iobj);
14381 if (res == NULL)
14382 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014383 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014384 return 0;
14385
14386wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014387 switch(type)
14388 {
14389 case 'o':
14390 case 'x':
14391 case 'X':
14392 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014393 "%%%c format: an integer is required, "
14394 "not %.200s",
14395 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014396 break;
14397 default:
14398 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014399 "%%%c format: a number is required, "
14400 "not %.200s",
14401 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014402 break;
14403 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014404 return -1;
14405}
14406
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014407static Py_UCS4
14408formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014409{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014410 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014411 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014412 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014413 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014414 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014415 goto onError;
14416 }
14417 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014418 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014419 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014420 /* make sure number is a type of integer */
14421 if (!PyLong_Check(v)) {
14422 iobj = PyNumber_Index(v);
14423 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014424 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014425 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014426 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014427 Py_DECREF(iobj);
14428 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014429 else {
14430 x = PyLong_AsLong(v);
14431 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014432 if (x == -1 && PyErr_Occurred())
14433 goto onError;
14434
Victor Stinner8faf8212011-12-08 22:14:11 +010014435 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014436 PyErr_SetString(PyExc_OverflowError,
14437 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014438 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014439 }
14440
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014441 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014442 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014443
Benjamin Peterson29060642009-01-31 22:14:21 +000014444 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014445 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014446 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014447 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014448}
14449
Victor Stinnera47082312012-10-04 02:19:54 +020014450/* Parse options of an argument: flags, width, precision.
14451 Handle also "%(name)" syntax.
14452
14453 Return 0 if the argument has been formatted into arg->str.
14454 Return 1 if the argument has been written into ctx->writer,
14455 Raise an exception and return -1 on error. */
14456static int
14457unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14458 struct unicode_format_arg_t *arg)
14459{
14460#define FORMAT_READ(ctx) \
14461 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14462
14463 PyObject *v;
14464
Victor Stinnera47082312012-10-04 02:19:54 +020014465 if (arg->ch == '(') {
14466 /* Get argument value from a dictionary. Example: "%(name)s". */
14467 Py_ssize_t keystart;
14468 Py_ssize_t keylen;
14469 PyObject *key;
14470 int pcount = 1;
14471
14472 if (ctx->dict == NULL) {
14473 PyErr_SetString(PyExc_TypeError,
14474 "format requires a mapping");
14475 return -1;
14476 }
14477 ++ctx->fmtpos;
14478 --ctx->fmtcnt;
14479 keystart = ctx->fmtpos;
14480 /* Skip over balanced parentheses */
14481 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14482 arg->ch = FORMAT_READ(ctx);
14483 if (arg->ch == ')')
14484 --pcount;
14485 else if (arg->ch == '(')
14486 ++pcount;
14487 ctx->fmtpos++;
14488 }
14489 keylen = ctx->fmtpos - keystart - 1;
14490 if (ctx->fmtcnt < 0 || pcount > 0) {
14491 PyErr_SetString(PyExc_ValueError,
14492 "incomplete format key");
14493 return -1;
14494 }
14495 key = PyUnicode_Substring(ctx->fmtstr,
14496 keystart, keystart + keylen);
14497 if (key == NULL)
14498 return -1;
14499 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014500 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014501 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014502 }
14503 ctx->args = PyObject_GetItem(ctx->dict, key);
14504 Py_DECREF(key);
14505 if (ctx->args == NULL)
14506 return -1;
14507 ctx->args_owned = 1;
14508 ctx->arglen = -1;
14509 ctx->argidx = -2;
14510 }
14511
14512 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014513 while (--ctx->fmtcnt >= 0) {
14514 arg->ch = FORMAT_READ(ctx);
14515 ctx->fmtpos++;
14516 switch (arg->ch) {
14517 case '-': arg->flags |= F_LJUST; continue;
14518 case '+': arg->flags |= F_SIGN; continue;
14519 case ' ': arg->flags |= F_BLANK; continue;
14520 case '#': arg->flags |= F_ALT; continue;
14521 case '0': arg->flags |= F_ZERO; continue;
14522 }
14523 break;
14524 }
14525
14526 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014527 if (arg->ch == '*') {
14528 v = unicode_format_getnextarg(ctx);
14529 if (v == NULL)
14530 return -1;
14531 if (!PyLong_Check(v)) {
14532 PyErr_SetString(PyExc_TypeError,
14533 "* wants int");
14534 return -1;
14535 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014536 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014537 if (arg->width == -1 && PyErr_Occurred())
14538 return -1;
14539 if (arg->width < 0) {
14540 arg->flags |= F_LJUST;
14541 arg->width = -arg->width;
14542 }
14543 if (--ctx->fmtcnt >= 0) {
14544 arg->ch = FORMAT_READ(ctx);
14545 ctx->fmtpos++;
14546 }
14547 }
14548 else if (arg->ch >= '0' && arg->ch <= '9') {
14549 arg->width = arg->ch - '0';
14550 while (--ctx->fmtcnt >= 0) {
14551 arg->ch = FORMAT_READ(ctx);
14552 ctx->fmtpos++;
14553 if (arg->ch < '0' || arg->ch > '9')
14554 break;
14555 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14556 mixing signed and unsigned comparison. Since arg->ch is between
14557 '0' and '9', casting to int is safe. */
14558 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14559 PyErr_SetString(PyExc_ValueError,
14560 "width too big");
14561 return -1;
14562 }
14563 arg->width = arg->width*10 + (arg->ch - '0');
14564 }
14565 }
14566
14567 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014568 if (arg->ch == '.') {
14569 arg->prec = 0;
14570 if (--ctx->fmtcnt >= 0) {
14571 arg->ch = FORMAT_READ(ctx);
14572 ctx->fmtpos++;
14573 }
14574 if (arg->ch == '*') {
14575 v = unicode_format_getnextarg(ctx);
14576 if (v == NULL)
14577 return -1;
14578 if (!PyLong_Check(v)) {
14579 PyErr_SetString(PyExc_TypeError,
14580 "* wants int");
14581 return -1;
14582 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014583 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014584 if (arg->prec == -1 && PyErr_Occurred())
14585 return -1;
14586 if (arg->prec < 0)
14587 arg->prec = 0;
14588 if (--ctx->fmtcnt >= 0) {
14589 arg->ch = FORMAT_READ(ctx);
14590 ctx->fmtpos++;
14591 }
14592 }
14593 else if (arg->ch >= '0' && arg->ch <= '9') {
14594 arg->prec = arg->ch - '0';
14595 while (--ctx->fmtcnt >= 0) {
14596 arg->ch = FORMAT_READ(ctx);
14597 ctx->fmtpos++;
14598 if (arg->ch < '0' || arg->ch > '9')
14599 break;
14600 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14601 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014602 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014603 return -1;
14604 }
14605 arg->prec = arg->prec*10 + (arg->ch - '0');
14606 }
14607 }
14608 }
14609
14610 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14611 if (ctx->fmtcnt >= 0) {
14612 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14613 if (--ctx->fmtcnt >= 0) {
14614 arg->ch = FORMAT_READ(ctx);
14615 ctx->fmtpos++;
14616 }
14617 }
14618 }
14619 if (ctx->fmtcnt < 0) {
14620 PyErr_SetString(PyExc_ValueError,
14621 "incomplete format");
14622 return -1;
14623 }
14624 return 0;
14625
14626#undef FORMAT_READ
14627}
14628
14629/* Format one argument. Supported conversion specifiers:
14630
14631 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014632 - "i", "d", "u": int or float
14633 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014634 - "e", "E", "f", "F", "g", "G": float
14635 - "c": int or str (1 character)
14636
Victor Stinner8dbd4212012-12-04 09:30:24 +010014637 When possible, the output is written directly into the Unicode writer
14638 (ctx->writer). A string is created when padding is required.
14639
Victor Stinnera47082312012-10-04 02:19:54 +020014640 Return 0 if the argument has been formatted into *p_str,
14641 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014642 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014643static int
14644unicode_format_arg_format(struct unicode_formatter_t *ctx,
14645 struct unicode_format_arg_t *arg,
14646 PyObject **p_str)
14647{
14648 PyObject *v;
14649 _PyUnicodeWriter *writer = &ctx->writer;
14650
14651 if (ctx->fmtcnt == 0)
14652 ctx->writer.overallocate = 0;
14653
Victor Stinnera47082312012-10-04 02:19:54 +020014654 v = unicode_format_getnextarg(ctx);
14655 if (v == NULL)
14656 return -1;
14657
Victor Stinnera47082312012-10-04 02:19:54 +020014658
14659 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014660 case 's':
14661 case 'r':
14662 case 'a':
14663 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14664 /* Fast path */
14665 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14666 return -1;
14667 return 1;
14668 }
14669
14670 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14671 *p_str = v;
14672 Py_INCREF(*p_str);
14673 }
14674 else {
14675 if (arg->ch == 's')
14676 *p_str = PyObject_Str(v);
14677 else if (arg->ch == 'r')
14678 *p_str = PyObject_Repr(v);
14679 else
14680 *p_str = PyObject_ASCII(v);
14681 }
14682 break;
14683
14684 case 'i':
14685 case 'd':
14686 case 'u':
14687 case 'o':
14688 case 'x':
14689 case 'X':
14690 {
14691 int ret = mainformatlong(v, arg, p_str, writer);
14692 if (ret != 0)
14693 return ret;
14694 arg->sign = 1;
14695 break;
14696 }
14697
14698 case 'e':
14699 case 'E':
14700 case 'f':
14701 case 'F':
14702 case 'g':
14703 case 'G':
14704 if (arg->width == -1 && arg->prec == -1
14705 && !(arg->flags & (F_SIGN | F_BLANK)))
14706 {
14707 /* Fast path */
14708 if (formatfloat(v, arg, NULL, writer) == -1)
14709 return -1;
14710 return 1;
14711 }
14712
14713 arg->sign = 1;
14714 if (formatfloat(v, arg, p_str, NULL) == -1)
14715 return -1;
14716 break;
14717
14718 case 'c':
14719 {
14720 Py_UCS4 ch = formatchar(v);
14721 if (ch == (Py_UCS4) -1)
14722 return -1;
14723 if (arg->width == -1 && arg->prec == -1) {
14724 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014725 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014726 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014727 return 1;
14728 }
14729 *p_str = PyUnicode_FromOrdinal(ch);
14730 break;
14731 }
14732
14733 default:
14734 PyErr_Format(PyExc_ValueError,
14735 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014736 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014737 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14738 (int)arg->ch,
14739 ctx->fmtpos - 1);
14740 return -1;
14741 }
14742 if (*p_str == NULL)
14743 return -1;
14744 assert (PyUnicode_Check(*p_str));
14745 return 0;
14746}
14747
14748static int
14749unicode_format_arg_output(struct unicode_formatter_t *ctx,
14750 struct unicode_format_arg_t *arg,
14751 PyObject *str)
14752{
14753 Py_ssize_t len;
14754 enum PyUnicode_Kind kind;
14755 void *pbuf;
14756 Py_ssize_t pindex;
14757 Py_UCS4 signchar;
14758 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014759 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014760 Py_ssize_t sublen;
14761 _PyUnicodeWriter *writer = &ctx->writer;
14762 Py_UCS4 fill;
14763
14764 fill = ' ';
14765 if (arg->sign && arg->flags & F_ZERO)
14766 fill = '0';
14767
14768 if (PyUnicode_READY(str) == -1)
14769 return -1;
14770
14771 len = PyUnicode_GET_LENGTH(str);
14772 if ((arg->width == -1 || arg->width <= len)
14773 && (arg->prec == -1 || arg->prec >= len)
14774 && !(arg->flags & (F_SIGN | F_BLANK)))
14775 {
14776 /* Fast path */
14777 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14778 return -1;
14779 return 0;
14780 }
14781
14782 /* Truncate the string for "s", "r" and "a" formats
14783 if the precision is set */
14784 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14785 if (arg->prec >= 0 && len > arg->prec)
14786 len = arg->prec;
14787 }
14788
14789 /* Adjust sign and width */
14790 kind = PyUnicode_KIND(str);
14791 pbuf = PyUnicode_DATA(str);
14792 pindex = 0;
14793 signchar = '\0';
14794 if (arg->sign) {
14795 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14796 if (ch == '-' || ch == '+') {
14797 signchar = ch;
14798 len--;
14799 pindex++;
14800 }
14801 else if (arg->flags & F_SIGN)
14802 signchar = '+';
14803 else if (arg->flags & F_BLANK)
14804 signchar = ' ';
14805 else
14806 arg->sign = 0;
14807 }
14808 if (arg->width < len)
14809 arg->width = len;
14810
14811 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014812 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014813 if (!(arg->flags & F_LJUST)) {
14814 if (arg->sign) {
14815 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014816 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014817 }
14818 else {
14819 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014820 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014821 }
14822 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014823 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14824 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014825 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014826 }
14827
Victor Stinnera47082312012-10-04 02:19:54 +020014828 buflen = arg->width;
14829 if (arg->sign && len == arg->width)
14830 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014831 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014832 return -1;
14833
14834 /* Write the sign if needed */
14835 if (arg->sign) {
14836 if (fill != ' ') {
14837 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14838 writer->pos += 1;
14839 }
14840 if (arg->width > len)
14841 arg->width--;
14842 }
14843
14844 /* Write the numeric prefix for "x", "X" and "o" formats
14845 if the alternate form is used.
14846 For example, write "0x" for the "%#x" format. */
14847 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14848 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14849 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14850 if (fill != ' ') {
14851 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14852 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14853 writer->pos += 2;
14854 pindex += 2;
14855 }
14856 arg->width -= 2;
14857 if (arg->width < 0)
14858 arg->width = 0;
14859 len -= 2;
14860 }
14861
14862 /* Pad left with the fill character if needed */
14863 if (arg->width > len && !(arg->flags & F_LJUST)) {
14864 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014865 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014866 writer->pos += sublen;
14867 arg->width = len;
14868 }
14869
14870 /* If padding with spaces: write sign if needed and/or numeric prefix if
14871 the alternate form is used */
14872 if (fill == ' ') {
14873 if (arg->sign) {
14874 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14875 writer->pos += 1;
14876 }
14877 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14878 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14879 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14880 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14881 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14882 writer->pos += 2;
14883 pindex += 2;
14884 }
14885 }
14886
14887 /* Write characters */
14888 if (len) {
14889 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14890 str, pindex, len);
14891 writer->pos += len;
14892 }
14893
14894 /* Pad right with the fill character if needed */
14895 if (arg->width > len) {
14896 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014897 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014898 writer->pos += sublen;
14899 }
14900 return 0;
14901}
14902
14903/* Helper of PyUnicode_Format(): format one arg.
14904 Return 0 on success, raise an exception and return -1 on error. */
14905static int
14906unicode_format_arg(struct unicode_formatter_t *ctx)
14907{
14908 struct unicode_format_arg_t arg;
14909 PyObject *str;
14910 int ret;
14911
Victor Stinner8dbd4212012-12-04 09:30:24 +010014912 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014913 if (arg.ch == '%') {
14914 ctx->fmtpos++;
14915 ctx->fmtcnt--;
14916 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14917 return -1;
14918 return 0;
14919 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014920 arg.flags = 0;
14921 arg.width = -1;
14922 arg.prec = -1;
14923 arg.sign = 0;
14924 str = NULL;
14925
Victor Stinnera47082312012-10-04 02:19:54 +020014926 ret = unicode_format_arg_parse(ctx, &arg);
14927 if (ret == -1)
14928 return -1;
14929
14930 ret = unicode_format_arg_format(ctx, &arg, &str);
14931 if (ret == -1)
14932 return -1;
14933
14934 if (ret != 1) {
14935 ret = unicode_format_arg_output(ctx, &arg, str);
14936 Py_DECREF(str);
14937 if (ret == -1)
14938 return -1;
14939 }
14940
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014941 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014942 PyErr_SetString(PyExc_TypeError,
14943 "not all arguments converted during string formatting");
14944 return -1;
14945 }
14946 return 0;
14947}
14948
Alexander Belopolsky40018472011-02-26 01:02:56 +000014949PyObject *
14950PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014951{
Victor Stinnera47082312012-10-04 02:19:54 +020014952 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014953
Guido van Rossumd57fd912000-03-10 22:53:23 +000014954 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014955 PyErr_BadInternalCall();
14956 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014957 }
Victor Stinnera47082312012-10-04 02:19:54 +020014958
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014959 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014960 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014961
14962 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014963 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14964 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14965 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14966 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014967
Victor Stinner8f674cc2013-04-17 23:02:17 +020014968 _PyUnicodeWriter_Init(&ctx.writer);
14969 ctx.writer.min_length = ctx.fmtcnt + 100;
14970 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014971
Guido van Rossumd57fd912000-03-10 22:53:23 +000014972 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014973 ctx.arglen = PyTuple_Size(args);
14974 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014975 }
14976 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014977 ctx.arglen = -1;
14978 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014979 }
Victor Stinnera47082312012-10-04 02:19:54 +020014980 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014981 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014982 ctx.dict = args;
14983 else
14984 ctx.dict = NULL;
14985 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014986
Victor Stinnera47082312012-10-04 02:19:54 +020014987 while (--ctx.fmtcnt >= 0) {
14988 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014989 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014990
14991 nonfmtpos = ctx.fmtpos++;
14992 while (ctx.fmtcnt >= 0 &&
14993 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14994 ctx.fmtpos++;
14995 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014996 }
Victor Stinnera47082312012-10-04 02:19:54 +020014997 if (ctx.fmtcnt < 0) {
14998 ctx.fmtpos--;
14999 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015000 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015001
Victor Stinnercfc4c132013-04-03 01:48:39 +020015002 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15003 nonfmtpos, ctx.fmtpos) < 0)
15004 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015005 }
15006 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015007 ctx.fmtpos++;
15008 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015009 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015010 }
15011 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015012
Victor Stinnera47082312012-10-04 02:19:54 +020015013 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015014 PyErr_SetString(PyExc_TypeError,
15015 "not all arguments converted during string formatting");
15016 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015017 }
15018
Victor Stinnera47082312012-10-04 02:19:54 +020015019 if (ctx.args_owned) {
15020 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015021 }
Victor Stinnera47082312012-10-04 02:19:54 +020015022 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015023
Benjamin Peterson29060642009-01-31 22:14:21 +000015024 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015025 _PyUnicodeWriter_Dealloc(&ctx.writer);
15026 if (ctx.args_owned) {
15027 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015028 }
15029 return NULL;
15030}
15031
Jeremy Hylton938ace62002-07-17 16:30:39 +000015032static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015033unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15034
Tim Peters6d6c1a32001-08-02 04:15:00 +000015035static PyObject *
15036unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15037{
Benjamin Peterson29060642009-01-31 22:14:21 +000015038 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015039 static char *kwlist[] = {"object", "encoding", "errors", 0};
15040 char *encoding = NULL;
15041 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015042
Benjamin Peterson14339b62009-01-31 16:36:08 +000015043 if (type != &PyUnicode_Type)
15044 return unicode_subtype_new(type, args, kwds);
15045 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015046 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015047 return NULL;
15048 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015049 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015050 if (encoding == NULL && errors == NULL)
15051 return PyObject_Str(x);
15052 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015053 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015054}
15055
Guido van Rossume023fe02001-08-30 03:12:59 +000015056static PyObject *
15057unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15058{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015059 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015060 Py_ssize_t length, char_size;
15061 int share_wstr, share_utf8;
15062 unsigned int kind;
15063 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015064
Benjamin Peterson14339b62009-01-31 16:36:08 +000015065 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015066
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015067 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015068 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015069 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015070 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015071 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015072 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015073 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015074 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015075
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015076 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015077 if (self == NULL) {
15078 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015079 return NULL;
15080 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015081 kind = PyUnicode_KIND(unicode);
15082 length = PyUnicode_GET_LENGTH(unicode);
15083
15084 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015085#ifdef Py_DEBUG
15086 _PyUnicode_HASH(self) = -1;
15087#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015088 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015089#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015090 _PyUnicode_STATE(self).interned = 0;
15091 _PyUnicode_STATE(self).kind = kind;
15092 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015093 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015094 _PyUnicode_STATE(self).ready = 1;
15095 _PyUnicode_WSTR(self) = NULL;
15096 _PyUnicode_UTF8_LENGTH(self) = 0;
15097 _PyUnicode_UTF8(self) = NULL;
15098 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015099 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015100
15101 share_utf8 = 0;
15102 share_wstr = 0;
15103 if (kind == PyUnicode_1BYTE_KIND) {
15104 char_size = 1;
15105 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15106 share_utf8 = 1;
15107 }
15108 else if (kind == PyUnicode_2BYTE_KIND) {
15109 char_size = 2;
15110 if (sizeof(wchar_t) == 2)
15111 share_wstr = 1;
15112 }
15113 else {
15114 assert(kind == PyUnicode_4BYTE_KIND);
15115 char_size = 4;
15116 if (sizeof(wchar_t) == 4)
15117 share_wstr = 1;
15118 }
15119
15120 /* Ensure we won't overflow the length. */
15121 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15122 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015123 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015124 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015125 data = PyObject_MALLOC((length + 1) * char_size);
15126 if (data == NULL) {
15127 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015128 goto onError;
15129 }
15130
Victor Stinnerc3c74152011-10-02 20:39:55 +020015131 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015132 if (share_utf8) {
15133 _PyUnicode_UTF8_LENGTH(self) = length;
15134 _PyUnicode_UTF8(self) = data;
15135 }
15136 if (share_wstr) {
15137 _PyUnicode_WSTR_LENGTH(self) = length;
15138 _PyUnicode_WSTR(self) = (wchar_t *)data;
15139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015140
Christian Heimesf051e432016-09-13 20:22:02 +020015141 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015142 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015143 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015144#ifdef Py_DEBUG
15145 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15146#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015147 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015148 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015149
15150onError:
15151 Py_DECREF(unicode);
15152 Py_DECREF(self);
15153 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015154}
15155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015156PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015157"str(object='') -> str\n\
15158str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015159\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015160Create a new string object from the given object. If encoding or\n\
15161errors is specified, then the object must expose a data buffer\n\
15162that will be decoded using the given encoding and error handler.\n\
15163Otherwise, returns the result of object.__str__() (if defined)\n\
15164or repr(object).\n\
15165encoding defaults to sys.getdefaultencoding().\n\
15166errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015167
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015168static PyObject *unicode_iter(PyObject *seq);
15169
Guido van Rossumd57fd912000-03-10 22:53:23 +000015170PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015171 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015172 "str", /* tp_name */
15173 sizeof(PyUnicodeObject), /* tp_basicsize */
15174 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015175 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015176 (destructor)unicode_dealloc, /* tp_dealloc */
15177 0, /* tp_print */
15178 0, /* tp_getattr */
15179 0, /* tp_setattr */
15180 0, /* tp_reserved */
15181 unicode_repr, /* tp_repr */
15182 &unicode_as_number, /* tp_as_number */
15183 &unicode_as_sequence, /* tp_as_sequence */
15184 &unicode_as_mapping, /* tp_as_mapping */
15185 (hashfunc) unicode_hash, /* tp_hash*/
15186 0, /* tp_call*/
15187 (reprfunc) unicode_str, /* tp_str */
15188 PyObject_GenericGetAttr, /* tp_getattro */
15189 0, /* tp_setattro */
15190 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015191 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015192 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15193 unicode_doc, /* tp_doc */
15194 0, /* tp_traverse */
15195 0, /* tp_clear */
15196 PyUnicode_RichCompare, /* tp_richcompare */
15197 0, /* tp_weaklistoffset */
15198 unicode_iter, /* tp_iter */
15199 0, /* tp_iternext */
15200 unicode_methods, /* tp_methods */
15201 0, /* tp_members */
15202 0, /* tp_getset */
15203 &PyBaseObject_Type, /* tp_base */
15204 0, /* tp_dict */
15205 0, /* tp_descr_get */
15206 0, /* tp_descr_set */
15207 0, /* tp_dictoffset */
15208 0, /* tp_init */
15209 0, /* tp_alloc */
15210 unicode_new, /* tp_new */
15211 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015212};
15213
15214/* Initialize the Unicode implementation */
15215
Victor Stinner3a50e702011-10-18 21:21:00 +020015216int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015217{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015218 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015219 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015220 0x000A, /* LINE FEED */
15221 0x000D, /* CARRIAGE RETURN */
15222 0x001C, /* FILE SEPARATOR */
15223 0x001D, /* GROUP SEPARATOR */
15224 0x001E, /* RECORD SEPARATOR */
15225 0x0085, /* NEXT LINE */
15226 0x2028, /* LINE SEPARATOR */
15227 0x2029, /* PARAGRAPH SEPARATOR */
15228 };
15229
Fred Drakee4315f52000-05-09 19:53:39 +000015230 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015231 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015232 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015233 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015234 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015235
Guido van Rossumcacfc072002-05-24 19:01:59 +000015236 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015237 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015238
15239 /* initialize the linebreak bloom filter */
15240 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015241 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015242 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015243
Christian Heimes26532f72013-07-20 14:57:16 +020015244 if (PyType_Ready(&EncodingMapType) < 0)
15245 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015246
Benjamin Petersonc4311282012-10-30 23:21:10 -040015247 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15248 Py_FatalError("Can't initialize field name iterator type");
15249
15250 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15251 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015252
Victor Stinner3a50e702011-10-18 21:21:00 +020015253 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015254}
15255
15256/* Finalize the Unicode implementation */
15257
/* This implementation does no work here and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15263
Guido van Rossumd57fd912000-03-10 22:53:23 +000015264void
Thomas Wouters78890102000-07-22 19:25:51 +000015265_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015266{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015267 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015268
Serhiy Storchaka05997252013-01-26 12:14:02 +020015269 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015270
Serhiy Storchaka05997252013-01-26 12:14:02 +020015271 for (i = 0; i < 256; i++)
15272 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015273 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015274 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015275}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015276
Walter Dörwald16807132007-05-25 13:52:07 +000015277void
15278PyUnicode_InternInPlace(PyObject **p)
15279{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015280 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015281 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015282#ifdef Py_DEBUG
15283 assert(s != NULL);
15284 assert(_PyUnicode_CHECK(s));
15285#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015286 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015287 return;
15288#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015289 /* If it's a subclass, we don't really know what putting
15290 it in the interned dict might do. */
15291 if (!PyUnicode_CheckExact(s))
15292 return;
15293 if (PyUnicode_CHECK_INTERNED(s))
15294 return;
15295 if (interned == NULL) {
15296 interned = PyDict_New();
15297 if (interned == NULL) {
15298 PyErr_Clear(); /* Don't leave an exception */
15299 return;
15300 }
15301 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015302 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015303 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015304 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015305 if (t == NULL) {
15306 PyErr_Clear();
15307 return;
15308 }
15309 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015310 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015311 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015312 return;
15313 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015314 /* The two references in interned are not counted by refcnt.
15315 The deallocator will take care of this */
15316 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015317 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015318}
15319
15320void
15321PyUnicode_InternImmortal(PyObject **p)
15322{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015323 PyUnicode_InternInPlace(p);
15324 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015325 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015326 Py_INCREF(*p);
15327 }
Walter Dörwald16807132007-05-25 13:52:07 +000015328}
15329
15330PyObject *
15331PyUnicode_InternFromString(const char *cp)
15332{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015333 PyObject *s = PyUnicode_FromString(cp);
15334 if (s == NULL)
15335 return NULL;
15336 PyUnicode_InternInPlace(&s);
15337 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015338}
15339
Alexander Belopolsky40018472011-02-26 01:02:56 +000015340void
15341_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015342{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015343 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015344 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015345 Py_ssize_t i, n;
15346 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015347
Benjamin Peterson14339b62009-01-31 16:36:08 +000015348 if (interned == NULL || !PyDict_Check(interned))
15349 return;
15350 keys = PyDict_Keys(interned);
15351 if (keys == NULL || !PyList_Check(keys)) {
15352 PyErr_Clear();
15353 return;
15354 }
Walter Dörwald16807132007-05-25 13:52:07 +000015355
Benjamin Peterson14339b62009-01-31 16:36:08 +000015356 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15357 detector, interned unicode strings are not forcibly deallocated;
15358 rather, we give them their stolen references back, and then clear
15359 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015360
Benjamin Peterson14339b62009-01-31 16:36:08 +000015361 n = PyList_GET_SIZE(keys);
15362 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015363 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015364 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015365 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015366 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015367 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015369 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015370 case SSTATE_NOT_INTERNED:
15371 /* XXX Shouldn't happen */
15372 break;
15373 case SSTATE_INTERNED_IMMORTAL:
15374 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015375 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015376 break;
15377 case SSTATE_INTERNED_MORTAL:
15378 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015379 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015380 break;
15381 default:
15382 Py_FatalError("Inconsistent interned string state.");
15383 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015384 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015385 }
15386 fprintf(stderr, "total size of all interned strings: "
15387 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15388 "mortal/immortal\n", mortal_size, immortal_size);
15389 Py_DECREF(keys);
15390 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015391 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015392}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015393
15394
15395/********************* Unicode Iterator **************************/
15396
15397typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015398 PyObject_HEAD
15399 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015400 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015401} unicodeiterobject;
15402
15403static void
15404unicodeiter_dealloc(unicodeiterobject *it)
15405{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015406 _PyObject_GC_UNTRACK(it);
15407 Py_XDECREF(it->it_seq);
15408 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015409}
15410
15411static int
15412unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15413{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015414 Py_VISIT(it->it_seq);
15415 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015416}
15417
15418static PyObject *
15419unicodeiter_next(unicodeiterobject *it)
15420{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015421 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015422
Benjamin Peterson14339b62009-01-31 16:36:08 +000015423 assert(it != NULL);
15424 seq = it->it_seq;
15425 if (seq == NULL)
15426 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015427 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015429 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15430 int kind = PyUnicode_KIND(seq);
15431 void *data = PyUnicode_DATA(seq);
15432 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15433 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015434 if (item != NULL)
15435 ++it->it_index;
15436 return item;
15437 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015438
Benjamin Peterson14339b62009-01-31 16:36:08 +000015439 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015440 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015441 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015442}
15443
15444static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015445unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015446{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015447 Py_ssize_t len = 0;
15448 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015449 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015450 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015451}
15452
15453PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15454
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015455static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015456unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015457{
15458 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015459 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015460 it->it_seq, it->it_index);
15461 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015462 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015463 if (u == NULL)
15464 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015465 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015466 }
15467}
15468
15469PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15470
15471static PyObject *
15472unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15473{
15474 Py_ssize_t index = PyLong_AsSsize_t(state);
15475 if (index == -1 && PyErr_Occurred())
15476 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015477 if (it->it_seq != NULL) {
15478 if (index < 0)
15479 index = 0;
15480 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15481 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15482 it->it_index = index;
15483 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015484 Py_RETURN_NONE;
15485}
15486
15487PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15488
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015489static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015490 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015491 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015492 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15493 reduce_doc},
15494 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15495 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015496 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015497};
15498
15499PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015500 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15501 "str_iterator", /* tp_name */
15502 sizeof(unicodeiterobject), /* tp_basicsize */
15503 0, /* tp_itemsize */
15504 /* methods */
15505 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15506 0, /* tp_print */
15507 0, /* tp_getattr */
15508 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015509 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015510 0, /* tp_repr */
15511 0, /* tp_as_number */
15512 0, /* tp_as_sequence */
15513 0, /* tp_as_mapping */
15514 0, /* tp_hash */
15515 0, /* tp_call */
15516 0, /* tp_str */
15517 PyObject_GenericGetAttr, /* tp_getattro */
15518 0, /* tp_setattro */
15519 0, /* tp_as_buffer */
15520 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15521 0, /* tp_doc */
15522 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15523 0, /* tp_clear */
15524 0, /* tp_richcompare */
15525 0, /* tp_weaklistoffset */
15526 PyObject_SelfIter, /* tp_iter */
15527 (iternextfunc)unicodeiter_next, /* tp_iternext */
15528 unicodeiter_methods, /* tp_methods */
15529 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015530};
15531
15532static PyObject *
15533unicode_iter(PyObject *seq)
15534{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015535 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015536
Benjamin Peterson14339b62009-01-31 16:36:08 +000015537 if (!PyUnicode_Check(seq)) {
15538 PyErr_BadInternalCall();
15539 return NULL;
15540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015541 if (PyUnicode_READY(seq) == -1)
15542 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015543 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15544 if (it == NULL)
15545 return NULL;
15546 it->it_index = 0;
15547 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015548 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015549 _PyObject_GC_TRACK(it);
15550 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015551}
15552
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015553
15554size_t
15555Py_UNICODE_strlen(const Py_UNICODE *u)
15556{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015557 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015558}
15559
15560Py_UNICODE*
15561Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15562{
15563 Py_UNICODE *u = s1;
15564 while ((*u++ = *s2++));
15565 return s1;
15566}
15567
15568Py_UNICODE*
15569Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15570{
15571 Py_UNICODE *u = s1;
15572 while ((*u++ = *s2++))
15573 if (n-- == 0)
15574 break;
15575 return s1;
15576}
15577
15578Py_UNICODE*
15579Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15580{
15581 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015582 u1 += wcslen(u1);
15583 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015584 return s1;
15585}
15586
15587int
15588Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15589{
15590 while (*s1 && *s2 && *s1 == *s2)
15591 s1++, s2++;
15592 if (*s1 && *s2)
15593 return (*s1 < *s2) ? -1 : +1;
15594 if (*s1)
15595 return 1;
15596 if (*s2)
15597 return -1;
15598 return 0;
15599}
15600
15601int
15602Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15603{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015604 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015605 for (; n != 0; n--) {
15606 u1 = *s1;
15607 u2 = *s2;
15608 if (u1 != u2)
15609 return (u1 < u2) ? -1 : +1;
15610 if (u1 == '\0')
15611 return 0;
15612 s1++;
15613 s2++;
15614 }
15615 return 0;
15616}
15617
15618Py_UNICODE*
15619Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15620{
15621 const Py_UNICODE *p;
15622 for (p = s; *p; p++)
15623 if (*p == c)
15624 return (Py_UNICODE*)p;
15625 return NULL;
15626}
15627
15628Py_UNICODE*
15629Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15630{
15631 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015632 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015633 while (p != s) {
15634 p--;
15635 if (*p == c)
15636 return (Py_UNICODE*)p;
15637 }
15638 return NULL;
15639}
Victor Stinner331ea922010-08-10 16:37:20 +000015640
Victor Stinner71133ff2010-09-01 23:43:53 +000015641Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015642PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015643{
Victor Stinner577db2c2011-10-11 22:12:48 +020015644 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015645 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015647 if (!PyUnicode_Check(unicode)) {
15648 PyErr_BadArgument();
15649 return NULL;
15650 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015651 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015652 if (u == NULL)
15653 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015654 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015655 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015656 PyErr_NoMemory();
15657 return NULL;
15658 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015659 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015660 size *= sizeof(Py_UNICODE);
15661 copy = PyMem_Malloc(size);
15662 if (copy == NULL) {
15663 PyErr_NoMemory();
15664 return NULL;
15665 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015666 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015667 return copy;
15668}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015669
Georg Brandl66c221e2010-10-14 07:04:07 +000015670/* A _string module, to export formatter_parser and formatter_field_name_split
15671 to the string.Formatter class implemented in Python. */
15672
15673static PyMethodDef _string_methods[] = {
15674 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15675 METH_O, PyDoc_STR("split the argument as a field name")},
15676 {"formatter_parser", (PyCFunction) formatter_parser,
15677 METH_O, PyDoc_STR("parse the argument as a format string")},
15678 {NULL, NULL}
15679};
15680
15681static struct PyModuleDef _string_module = {
15682 PyModuleDef_HEAD_INIT,
15683 "_string",
15684 PyDoc_STR("string helper module"),
15685 0,
15686 _string_methods,
15687 NULL,
15688 NULL,
15689 NULL,
15690 NULL
15691};
15692
15693PyMODINIT_FUNC
15694PyInit__string(void)
15695{
15696 return PyModule_Create(&_string_module);
15697}
15698
15699
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015700#ifdef __cplusplus
15701}
15702#endif